/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

#define RECON_AND_STORE4X4(dest, in_x) \
{ \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  *(int *)dest = _mm_cvtsi128_si32(d0); \
  dest += stride; \
}

void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column 2 and column 3; then we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
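  // Note: 0x4e is _MM_SHUFFLE(1, 0, 3, 2), i.e. it swaps the two 64-bit
  // halves of input1 before the final add/sub butterfly below.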
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column 2 and column 3; then we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
                            *(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store input0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store input1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store input2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store input3
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
}

static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}

static void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

static void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[2];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadu_si128((const __m128i *)(input));
  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_sse2(in);
      idct4_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_sse2(in);
      iadst4_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_sse2(in);
      idct4_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_sse2(in);
      iadst4_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
                            *(const int *)(dest + stride * 3)));
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, in[0]);
    d2 = _mm_add_epi16(d2, in[1]);
    d0 = _mm_packus_epi16(d0, d2);
    // store result[0]
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store result[1]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store result[2]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
    // store result[3]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
  }
}

#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  }

#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */ \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */ \
  }

#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  }

// Define Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
  }

#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    /* Stage1 */ \
    { \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
      \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                             stg1_1, stg1_2, stg1_3, stp1_4, \
                             stp1_7, stp1_5, stp1_6) \
    } \
    \
    /* Stage2 */ \
    { \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
      \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                             stg2_1, stg2_2, stg2_3, stp2_0, \
                             stp2_1, stp2_2, stp2_3) \
      \
      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
    } \
    \
    /* Stage3 */ \
    { \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
      \
      tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
      tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
      tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
      tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    } \
    \
    /* Stage4 */ \
    out0 = _mm_adds_epi16(stp1_0, stp2_7); \
    out1 = _mm_adds_epi16(stp1_1, stp1_6); \
    out2 = _mm_adds_epi16(stp1_2, stp1_5); \
    out3 = _mm_adds_epi16(stp1_3, stp2_4); \
    out4 = _mm_subs_epi16(stp1_3, stp2_4); \
    out5 = _mm_subs_epi16(stp1_2, stp1_5); \
    out6 = _mm_subs_epi16(stp1_1, stp1_6); \
    out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }

#define RECON_AND_STORE(dest, in_x) \
  { \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    d0 = _mm_add_epi16(in_x, d0); \
    d0 = _mm_packus_epi16(d0, d0); \
    _mm_storel_epi64((__m128i *)(dest), d0); \
    dest += stride; \
  }

void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
                  in0, in1, in2, in3, in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
          in0, in1, in2, in3, in4, in5, in6, in7);
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
}

// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);

  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
}

static void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                in0, in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}

static void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_sse2(in);
      idct8_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_sse2(in);
      iadst8_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_sse2(in);
      idct8_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_sse2(in);
      iadst8_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
}

void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1
  { //NOLINT
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  { //NOLINT
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  { //NOLINT
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
        in0, in1, in2, in3, in4, in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

#define IDCT16_10 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
                           stg2_0, stg2_1, stg2_6, stg2_7, \
                           stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
    \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
                             stg3_0, stg3_1, \
                             stp2_4, stp2_7) \
    \
    stp1_9 = stp1_8_0; \
    stp1_10 = stp1_11; \
    \
    stp1_13 = stp1_12_0; \
    stp1_14 = stp1_15; \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
                             stg4_0, stg4_1, \
                             stp1_0, stp1_1) \
    stp2_5 = stp2_4; \
    stp2_6 = stp2_7; \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_2 = stp1_1; \
    stp1_3 = stp1_0; \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
                                int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  curr1 = l;
  for (i = 0; i < 2; i++) {
    // 1-D idct

    // Load input data.
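    // Each 16-wide row occupies two registers: in[k] gets columns 0-7 and
    // in[k + 8] gets columns 8-15 of row k, so the two 8x8 transposes below
    // leave in[j] holding coefficient j of all eight rows.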
    in[0] = _mm_load_si128((const __m128i *)input);
    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));

    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);

    IDCT16

    // Stage7
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;
    input += 128;
  }
  for (i = 0; i < 2; i++) {
    // 1-D idct
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // 2-D
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    // Final rounding and shift
    in[0] = _mm_adds_epi16(in[0], final_rounding);
    in[1] = _mm_adds_epi16(in[1], final_rounding);
    in[2] = _mm_adds_epi16(in[2], final_rounding);
    in[3] = _mm_adds_epi16(in[3], final_rounding);
    in[4] = _mm_adds_epi16(in[4], final_rounding);
    in[5] = _mm_adds_epi16(in[5], final_rounding);
    in[6] = _mm_adds_epi16(in[6], final_rounding);
    in[7] = _mm_adds_epi16(in[7], final_rounding);
    in[8] = _mm_adds_epi16(in[8], final_rounding);
    in[9] = _mm_adds_epi16(in[9], final_rounding);
    in[10] = _mm_adds_epi16(in[10], final_rounding);
    in[11] = _mm_adds_epi16(in[11], final_rounding);
    in[12] = _mm_adds_epi16(in[12], final_rounding);
_mm_adds_epi16(in[13], final_rounding); 1503 in[14] = _mm_adds_epi16(in[14], final_rounding); 1504 in[15] = _mm_adds_epi16(in[15], final_rounding); 1505 1506 in[0] = _mm_srai_epi16(in[0], 6); 1507 in[1] = _mm_srai_epi16(in[1], 6); 1508 in[2] = _mm_srai_epi16(in[2], 6); 1509 in[3] = _mm_srai_epi16(in[3], 6); 1510 in[4] = _mm_srai_epi16(in[4], 6); 1511 in[5] = _mm_srai_epi16(in[5], 6); 1512 in[6] = _mm_srai_epi16(in[6], 6); 1513 in[7] = _mm_srai_epi16(in[7], 6); 1514 in[8] = _mm_srai_epi16(in[8], 6); 1515 in[9] = _mm_srai_epi16(in[9], 6); 1516 in[10] = _mm_srai_epi16(in[10], 6); 1517 in[11] = _mm_srai_epi16(in[11], 6); 1518 in[12] = _mm_srai_epi16(in[12], 6); 1519 in[13] = _mm_srai_epi16(in[13], 6); 1520 in[14] = _mm_srai_epi16(in[14], 6); 1521 in[15] = _mm_srai_epi16(in[15], 6); 1522 1523 RECON_AND_STORE(dest, in[0]); 1524 RECON_AND_STORE(dest, in[1]); 1525 RECON_AND_STORE(dest, in[2]); 1526 RECON_AND_STORE(dest, in[3]); 1527 RECON_AND_STORE(dest, in[4]); 1528 RECON_AND_STORE(dest, in[5]); 1529 RECON_AND_STORE(dest, in[6]); 1530 RECON_AND_STORE(dest, in[7]); 1531 RECON_AND_STORE(dest, in[8]); 1532 RECON_AND_STORE(dest, in[9]); 1533 RECON_AND_STORE(dest, in[10]); 1534 RECON_AND_STORE(dest, in[11]); 1535 RECON_AND_STORE(dest, in[12]); 1536 RECON_AND_STORE(dest, in[13]); 1537 RECON_AND_STORE(dest, in[14]); 1538 RECON_AND_STORE(dest, in[15]); 1539 1540 dest += 8 - (stride * 16); 1541 } 1542} 1543 1544void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 1545 __m128i dc_value; 1546 const __m128i zero = _mm_setzero_si128(); 1547 int a, i; 1548 1549 a = dct_const_round_shift(input[0] * cospi_16_64); 1550 a = dct_const_round_shift(a * cospi_16_64); 1551 a = ROUND_POWER_OF_TWO(a, 6); 1552 1553 dc_value = _mm_set1_epi16(a); 1554 1555 for (i = 0; i < 2; ++i) { 1556 RECON_AND_STORE(dest, dc_value); 1557 RECON_AND_STORE(dest, dc_value); 1558 RECON_AND_STORE(dest, dc_value); 1559 RECON_AND_STORE(dest, dc_value); 1560 RECON_AND_STORE(dest, dc_value); 1561 RECON_AND_STORE(dest, dc_value); 1562 RECON_AND_STORE(dest, dc_value); 1563 RECON_AND_STORE(dest, dc_value); 1564 RECON_AND_STORE(dest, dc_value); 1565 RECON_AND_STORE(dest, dc_value); 1566 RECON_AND_STORE(dest, dc_value); 1567 RECON_AND_STORE(dest, dc_value); 1568 RECON_AND_STORE(dest, dc_value); 1569 RECON_AND_STORE(dest, dc_value); 1570 RECON_AND_STORE(dest, dc_value); 1571 RECON_AND_STORE(dest, dc_value); 1572 dest += 8 - (stride * 16); 1573 } 1574} 1575 1576static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { 1577 __m128i tbuf[8]; 1578 array_transpose_8x8(res0, res0); 1579 array_transpose_8x8(res1, tbuf); 1580 array_transpose_8x8(res0 + 8, res1); 1581 array_transpose_8x8(res1 + 8, res1 + 8); 1582 1583 res0[8] = tbuf[0]; 1584 res0[9] = tbuf[1]; 1585 res0[10] = tbuf[2]; 1586 res0[11] = tbuf[3]; 1587 res0[12] = tbuf[4]; 1588 res0[13] = tbuf[5]; 1589 res0[14] = tbuf[6]; 1590 res0[15] = tbuf[7]; 1591} 1592 1593static void iadst16_8col(__m128i *in) { 1594 // perform 16x16 1-D ADST for 8 columns 1595 __m128i s[16], x[16], u[32], v[32]; 1596 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 1597 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 1598 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 1599 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 1600 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 1601 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 1602 
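  // Note (added for clarity): each k__cospi_pXX_pYY constant packs two cosine
  // factors into alternating 16-bit lanes, so one _mm_madd_epi16() against an
  // unpacked pair of rows yields a * cospi_XX + b * cospi_YY in every 32-bit
  // lane; adding k__DCT_CONST_ROUNDING and shifting by DCT_CONST_BITS then
  // matches the scalar dct_const_round_shift().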
const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 1603 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 1604 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 1605 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 1606 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); 1607 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 1608 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 1609 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 1610 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 1611 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 1612 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 1613 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1614 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 1615 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1616 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 1617 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 1618 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 1619 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1620 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 1621 const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); 1622 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1623 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1624 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1625 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1626 const __m128i kZero = _mm_set1_epi16(0); 1627 1628 u[0] = _mm_unpacklo_epi16(in[15], in[0]); 1629 u[1] = _mm_unpackhi_epi16(in[15], in[0]); 1630 u[2] = _mm_unpacklo_epi16(in[13], in[2]); 1631 u[3] = _mm_unpackhi_epi16(in[13], in[2]); 1632 u[4] = _mm_unpacklo_epi16(in[11], in[4]); 1633 u[5] = _mm_unpackhi_epi16(in[11], in[4]); 1634 u[6] = _mm_unpacklo_epi16(in[9], in[6]); 1635 u[7] = _mm_unpackhi_epi16(in[9], in[6]); 1636 u[8] = _mm_unpacklo_epi16(in[7], in[8]); 1637 u[9] = _mm_unpackhi_epi16(in[7], in[8]); 1638 u[10] = _mm_unpacklo_epi16(in[5], in[10]); 1639 u[11] = _mm_unpackhi_epi16(in[5], in[10]); 1640 u[12] = _mm_unpacklo_epi16(in[3], in[12]); 1641 u[13] = _mm_unpackhi_epi16(in[3], in[12]); 1642 u[14] = _mm_unpacklo_epi16(in[1], in[14]); 1643 u[15] = _mm_unpackhi_epi16(in[1], in[14]); 1644 1645 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); 1646 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 1647 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 1648 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 1649 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 1650 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 1651 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); 1652 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 1653 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 1654 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 1655 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 1656 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 1657 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 1658 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 1659 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 1660 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 1661 v[16] = 
_mm_madd_epi16(u[8], k__cospi_p17_p15); 1662 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); 1663 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 1664 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 1665 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); 1666 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 1667 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 1668 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 1669 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 1670 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 1671 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 1672 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 1673 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 1674 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 1675 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 1676 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 1677 1678 u[0] = _mm_add_epi32(v[0], v[16]); 1679 u[1] = _mm_add_epi32(v[1], v[17]); 1680 u[2] = _mm_add_epi32(v[2], v[18]); 1681 u[3] = _mm_add_epi32(v[3], v[19]); 1682 u[4] = _mm_add_epi32(v[4], v[20]); 1683 u[5] = _mm_add_epi32(v[5], v[21]); 1684 u[6] = _mm_add_epi32(v[6], v[22]); 1685 u[7] = _mm_add_epi32(v[7], v[23]); 1686 u[8] = _mm_add_epi32(v[8], v[24]); 1687 u[9] = _mm_add_epi32(v[9], v[25]); 1688 u[10] = _mm_add_epi32(v[10], v[26]); 1689 u[11] = _mm_add_epi32(v[11], v[27]); 1690 u[12] = _mm_add_epi32(v[12], v[28]); 1691 u[13] = _mm_add_epi32(v[13], v[29]); 1692 u[14] = _mm_add_epi32(v[14], v[30]); 1693 u[15] = _mm_add_epi32(v[15], v[31]); 1694 u[16] = _mm_sub_epi32(v[0], v[16]); 1695 u[17] = _mm_sub_epi32(v[1], v[17]); 1696 u[18] = _mm_sub_epi32(v[2], v[18]); 1697 u[19] = _mm_sub_epi32(v[3], v[19]); 1698 u[20] = _mm_sub_epi32(v[4], v[20]); 1699 u[21] = _mm_sub_epi32(v[5], v[21]); 1700 u[22] = _mm_sub_epi32(v[6], v[22]); 1701 u[23] = _mm_sub_epi32(v[7], v[23]); 1702 u[24] = _mm_sub_epi32(v[8], v[24]); 1703 u[25] = _mm_sub_epi32(v[9], v[25]); 1704 u[26] = _mm_sub_epi32(v[10], v[26]); 1705 u[27] = _mm_sub_epi32(v[11], v[27]); 1706 u[28] = _mm_sub_epi32(v[12], v[28]); 1707 u[29] = _mm_sub_epi32(v[13], v[29]); 1708 u[30] = _mm_sub_epi32(v[14], v[30]); 1709 u[31] = _mm_sub_epi32(v[15], v[31]); 1710 1711 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1712 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1713 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1714 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1715 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1716 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1717 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1718 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1719 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1720 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1721 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1722 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1723 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1724 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1725 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1726 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1727 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 1728 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 1729 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 1730 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 1731 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 1732 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 1733 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 1734 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 1735 v[24] 
= _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 1736 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 1737 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 1738 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 1739 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 1740 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 1741 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 1742 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 1743 1744 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1745 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1746 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1747 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1748 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1749 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1750 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1751 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1752 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 1753 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 1754 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1755 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1756 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1757 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1758 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1759 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1760 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 1761 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 1762 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 1763 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); 1764 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 1765 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 1766 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 1767 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 1768 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 1769 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 1770 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 1771 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 1772 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 1773 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 1774 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 1775 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 1776 1777 s[0] = _mm_packs_epi32(u[0], u[1]); 1778 s[1] = _mm_packs_epi32(u[2], u[3]); 1779 s[2] = _mm_packs_epi32(u[4], u[5]); 1780 s[3] = _mm_packs_epi32(u[6], u[7]); 1781 s[4] = _mm_packs_epi32(u[8], u[9]); 1782 s[5] = _mm_packs_epi32(u[10], u[11]); 1783 s[6] = _mm_packs_epi32(u[12], u[13]); 1784 s[7] = _mm_packs_epi32(u[14], u[15]); 1785 s[8] = _mm_packs_epi32(u[16], u[17]); 1786 s[9] = _mm_packs_epi32(u[18], u[19]); 1787 s[10] = _mm_packs_epi32(u[20], u[21]); 1788 s[11] = _mm_packs_epi32(u[22], u[23]); 1789 s[12] = _mm_packs_epi32(u[24], u[25]); 1790 s[13] = _mm_packs_epi32(u[26], u[27]); 1791 s[14] = _mm_packs_epi32(u[28], u[29]); 1792 s[15] = _mm_packs_epi32(u[30], u[31]); 1793 1794 // stage 2 1795 u[0] = _mm_unpacklo_epi16(s[8], s[9]); 1796 u[1] = _mm_unpackhi_epi16(s[8], s[9]); 1797 u[2] = _mm_unpacklo_epi16(s[10], s[11]); 1798 u[3] = _mm_unpackhi_epi16(s[10], s[11]); 1799 u[4] = _mm_unpacklo_epi16(s[12], s[13]); 1800 u[5] = _mm_unpackhi_epi16(s[12], s[13]); 1801 u[6] = _mm_unpacklo_epi16(s[14], s[15]); 1802 u[7] = _mm_unpackhi_epi16(s[14], s[15]); 1803 1804 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 1805 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 1806 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 1807 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 1808 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 1809 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 1810 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 1811 v[7] 
= _mm_madd_epi16(u[3], k__cospi_p12_m20); 1812 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 1813 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 1814 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 1815 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 1816 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 1817 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 1818 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 1819 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 1820 1821 u[0] = _mm_add_epi32(v[0], v[8]); 1822 u[1] = _mm_add_epi32(v[1], v[9]); 1823 u[2] = _mm_add_epi32(v[2], v[10]); 1824 u[3] = _mm_add_epi32(v[3], v[11]); 1825 u[4] = _mm_add_epi32(v[4], v[12]); 1826 u[5] = _mm_add_epi32(v[5], v[13]); 1827 u[6] = _mm_add_epi32(v[6], v[14]); 1828 u[7] = _mm_add_epi32(v[7], v[15]); 1829 u[8] = _mm_sub_epi32(v[0], v[8]); 1830 u[9] = _mm_sub_epi32(v[1], v[9]); 1831 u[10] = _mm_sub_epi32(v[2], v[10]); 1832 u[11] = _mm_sub_epi32(v[3], v[11]); 1833 u[12] = _mm_sub_epi32(v[4], v[12]); 1834 u[13] = _mm_sub_epi32(v[5], v[13]); 1835 u[14] = _mm_sub_epi32(v[6], v[14]); 1836 u[15] = _mm_sub_epi32(v[7], v[15]); 1837 1838 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1839 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1840 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1841 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1842 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1843 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1844 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1845 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1846 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1847 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1848 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1849 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1850 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1851 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1852 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1853 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1854 1855 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1856 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1857 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1858 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1859 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1860 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1861 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1862 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1863 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 1864 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 1865 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1866 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1867 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1868 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1869 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1870 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1871 1872 x[0] = _mm_add_epi16(s[0], s[4]); 1873 x[1] = _mm_add_epi16(s[1], s[5]); 1874 x[2] = _mm_add_epi16(s[2], s[6]); 1875 x[3] = _mm_add_epi16(s[3], s[7]); 1876 x[4] = _mm_sub_epi16(s[0], s[4]); 1877 x[5] = _mm_sub_epi16(s[1], s[5]); 1878 x[6] = _mm_sub_epi16(s[2], s[6]); 1879 x[7] = _mm_sub_epi16(s[3], s[7]); 1880 x[8] = _mm_packs_epi32(u[0], u[1]); 1881 x[9] = _mm_packs_epi32(u[2], u[3]); 1882 x[10] = _mm_packs_epi32(u[4], u[5]); 1883 x[11] = _mm_packs_epi32(u[6], u[7]); 1884 x[12] = _mm_packs_epi32(u[8], u[9]); 1885 x[13] = _mm_packs_epi32(u[10], u[11]); 1886 x[14] = _mm_packs_epi32(u[12], u[13]); 1887 x[15] = _mm_packs_epi32(u[14], u[15]); 1888 1889 // stage 3 1890 u[0] = 
_mm_unpacklo_epi16(x[4], x[5]); 1891 u[1] = _mm_unpackhi_epi16(x[4], x[5]); 1892 u[2] = _mm_unpacklo_epi16(x[6], x[7]); 1893 u[3] = _mm_unpackhi_epi16(x[6], x[7]); 1894 u[4] = _mm_unpacklo_epi16(x[12], x[13]); 1895 u[5] = _mm_unpackhi_epi16(x[12], x[13]); 1896 u[6] = _mm_unpacklo_epi16(x[14], x[15]); 1897 u[7] = _mm_unpackhi_epi16(x[14], x[15]); 1898 1899 v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 1900 v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 1901 v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 1902 v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 1903 v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 1904 v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); 1905 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 1906 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 1907 v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 1908 v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 1909 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); 1910 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 1911 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); 1912 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 1913 v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 1914 v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 1915 1916 u[0] = _mm_add_epi32(v[0], v[4]); 1917 u[1] = _mm_add_epi32(v[1], v[5]); 1918 u[2] = _mm_add_epi32(v[2], v[6]); 1919 u[3] = _mm_add_epi32(v[3], v[7]); 1920 u[4] = _mm_sub_epi32(v[0], v[4]); 1921 u[5] = _mm_sub_epi32(v[1], v[5]); 1922 u[6] = _mm_sub_epi32(v[2], v[6]); 1923 u[7] = _mm_sub_epi32(v[3], v[7]); 1924 u[8] = _mm_add_epi32(v[8], v[12]); 1925 u[9] = _mm_add_epi32(v[9], v[13]); 1926 u[10] = _mm_add_epi32(v[10], v[14]); 1927 u[11] = _mm_add_epi32(v[11], v[15]); 1928 u[12] = _mm_sub_epi32(v[8], v[12]); 1929 u[13] = _mm_sub_epi32(v[9], v[13]); 1930 u[14] = _mm_sub_epi32(v[10], v[14]); 1931 u[15] = _mm_sub_epi32(v[11], v[15]); 1932 1933 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1934 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1935 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1936 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1937 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1938 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1939 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1940 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1941 u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1942 u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1943 u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1944 u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1945 u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1946 u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1947 u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1948 u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1949 1950 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1951 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1952 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1953 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1954 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1955 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1956 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1957 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1958 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1959 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1960 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1961 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1962 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 1963 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1964 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 1965 v[15] = _mm_srai_epi32(u[15], 
DCT_CONST_BITS); 1966 1967 s[0] = _mm_add_epi16(x[0], x[2]); 1968 s[1] = _mm_add_epi16(x[1], x[3]); 1969 s[2] = _mm_sub_epi16(x[0], x[2]); 1970 s[3] = _mm_sub_epi16(x[1], x[3]); 1971 s[4] = _mm_packs_epi32(v[0], v[1]); 1972 s[5] = _mm_packs_epi32(v[2], v[3]); 1973 s[6] = _mm_packs_epi32(v[4], v[5]); 1974 s[7] = _mm_packs_epi32(v[6], v[7]); 1975 s[8] = _mm_add_epi16(x[8], x[10]); 1976 s[9] = _mm_add_epi16(x[9], x[11]); 1977 s[10] = _mm_sub_epi16(x[8], x[10]); 1978 s[11] = _mm_sub_epi16(x[9], x[11]); 1979 s[12] = _mm_packs_epi32(v[8], v[9]); 1980 s[13] = _mm_packs_epi32(v[10], v[11]); 1981 s[14] = _mm_packs_epi32(v[12], v[13]); 1982 s[15] = _mm_packs_epi32(v[14], v[15]); 1983 1984 // stage 4 1985 u[0] = _mm_unpacklo_epi16(s[2], s[3]); 1986 u[1] = _mm_unpackhi_epi16(s[2], s[3]); 1987 u[2] = _mm_unpacklo_epi16(s[6], s[7]); 1988 u[3] = _mm_unpackhi_epi16(s[6], s[7]); 1989 u[4] = _mm_unpacklo_epi16(s[10], s[11]); 1990 u[5] = _mm_unpackhi_epi16(s[10], s[11]); 1991 u[6] = _mm_unpacklo_epi16(s[14], s[15]); 1992 u[7] = _mm_unpackhi_epi16(s[14], s[15]); 1993 1994 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); 1995 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); 1996 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 1997 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 1998 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 1999 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 2000 v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 2001 v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 2002 v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); 2003 v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); 2004 v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); 2005 v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); 2006 v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); 2007 v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); 2008 v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); 2009 v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); 2010 2011 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2012 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2013 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2014 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2015 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2016 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2017 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2018 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2019 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 2020 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 2021 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 2022 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 2023 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 2024 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 2025 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 2026 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 2027 2028 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2029 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2030 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2031 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2032 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2033 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2034 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2035 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2036 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2037 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2038 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2039 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2040 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2041 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2042 v[14] = 
_mm_srai_epi32(u[14], DCT_CONST_BITS); 2043 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2044 2045 in[0] = s[0]; 2046 in[1] = _mm_sub_epi16(kZero, s[8]); 2047 in[2] = s[12]; 2048 in[3] = _mm_sub_epi16(kZero, s[4]); 2049 in[4] = _mm_packs_epi32(v[4], v[5]); 2050 in[5] = _mm_packs_epi32(v[12], v[13]); 2051 in[6] = _mm_packs_epi32(v[8], v[9]); 2052 in[7] = _mm_packs_epi32(v[0], v[1]); 2053 in[8] = _mm_packs_epi32(v[2], v[3]); 2054 in[9] = _mm_packs_epi32(v[10], v[11]); 2055 in[10] = _mm_packs_epi32(v[14], v[15]); 2056 in[11] = _mm_packs_epi32(v[6], v[7]); 2057 in[12] = s[5]; 2058 in[13] = _mm_sub_epi16(kZero, s[13]); 2059 in[14] = s[9]; 2060 in[15] = _mm_sub_epi16(kZero, s[1]); 2061} 2062 2063static void idct16_8col(__m128i *in) { 2064 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 2065 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 2066 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 2067 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 2068 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 2069 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 2070 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 2071 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 2072 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 2073 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 2074 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 2075 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 2076 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 2077 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 2078 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 2079 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 2080 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2081 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 2082 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 2083 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 2084 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 2085 __m128i v[16], u[16], s[16], t[16]; 2086 2087 // stage 1 2088 s[0] = in[0]; 2089 s[1] = in[8]; 2090 s[2] = in[4]; 2091 s[3] = in[12]; 2092 s[4] = in[2]; 2093 s[5] = in[10]; 2094 s[6] = in[6]; 2095 s[7] = in[14]; 2096 s[8] = in[1]; 2097 s[9] = in[9]; 2098 s[10] = in[5]; 2099 s[11] = in[13]; 2100 s[12] = in[3]; 2101 s[13] = in[11]; 2102 s[14] = in[7]; 2103 s[15] = in[15]; 2104 2105 // stage 2 2106 u[0] = _mm_unpacklo_epi16(s[8], s[15]); 2107 u[1] = _mm_unpackhi_epi16(s[8], s[15]); 2108 u[2] = _mm_unpacklo_epi16(s[9], s[14]); 2109 u[3] = _mm_unpackhi_epi16(s[9], s[14]); 2110 u[4] = _mm_unpacklo_epi16(s[10], s[13]); 2111 u[5] = _mm_unpackhi_epi16(s[10], s[13]); 2112 u[6] = _mm_unpacklo_epi16(s[11], s[12]); 2113 u[7] = _mm_unpackhi_epi16(s[11], s[12]); 2114 2115 v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); 2116 v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); 2117 v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); 2118 v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); 2119 v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); 2120 v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); 2121 v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); 2122 v[7] = 
_mm_madd_epi16(u[3], k__cospi_p18_p14); 2123 v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); 2124 v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); 2125 v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); 2126 v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); 2127 v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); 2128 v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); 2129 v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); 2130 v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); 2131 2132 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2133 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2134 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2135 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2136 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2137 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2138 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2139 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2140 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 2141 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 2142 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 2143 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 2144 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 2145 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 2146 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 2147 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 2148 2149 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2150 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2151 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2152 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2153 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2154 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2155 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2156 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2157 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2158 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2159 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2160 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2161 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2162 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2163 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2164 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2165 2166 s[8] = _mm_packs_epi32(u[0], u[1]); 2167 s[15] = _mm_packs_epi32(u[2], u[3]); 2168 s[9] = _mm_packs_epi32(u[4], u[5]); 2169 s[14] = _mm_packs_epi32(u[6], u[7]); 2170 s[10] = _mm_packs_epi32(u[8], u[9]); 2171 s[13] = _mm_packs_epi32(u[10], u[11]); 2172 s[11] = _mm_packs_epi32(u[12], u[13]); 2173 s[12] = _mm_packs_epi32(u[14], u[15]); 2174 2175 // stage 3 2176 t[0] = s[0]; 2177 t[1] = s[1]; 2178 t[2] = s[2]; 2179 t[3] = s[3]; 2180 u[0] = _mm_unpacklo_epi16(s[4], s[7]); 2181 u[1] = _mm_unpackhi_epi16(s[4], s[7]); 2182 u[2] = _mm_unpacklo_epi16(s[5], s[6]); 2183 u[3] = _mm_unpackhi_epi16(s[5], s[6]); 2184 2185 v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 2186 v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 2187 v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 2188 v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 2189 v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 2190 v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 2191 v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 2192 v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 2193 2194 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2195 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2196 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2197 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2198 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2199 u[5] = _mm_add_epi32(v[5], 
k__DCT_CONST_ROUNDING); 2200 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2201 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2202 2203 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2204 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2205 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2206 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2207 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2208 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2209 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2210 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2211 2212 t[4] = _mm_packs_epi32(u[0], u[1]); 2213 t[7] = _mm_packs_epi32(u[2], u[3]); 2214 t[5] = _mm_packs_epi32(u[4], u[5]); 2215 t[6] = _mm_packs_epi32(u[6], u[7]); 2216 t[8] = _mm_add_epi16(s[8], s[9]); 2217 t[9] = _mm_sub_epi16(s[8], s[9]); 2218 t[10] = _mm_sub_epi16(s[11], s[10]); 2219 t[11] = _mm_add_epi16(s[10], s[11]); 2220 t[12] = _mm_add_epi16(s[12], s[13]); 2221 t[13] = _mm_sub_epi16(s[12], s[13]); 2222 t[14] = _mm_sub_epi16(s[15], s[14]); 2223 t[15] = _mm_add_epi16(s[14], s[15]); 2224 2225 // stage 4 2226 u[0] = _mm_unpacklo_epi16(t[0], t[1]); 2227 u[1] = _mm_unpackhi_epi16(t[0], t[1]); 2228 u[2] = _mm_unpacklo_epi16(t[2], t[3]); 2229 u[3] = _mm_unpackhi_epi16(t[2], t[3]); 2230 u[4] = _mm_unpacklo_epi16(t[9], t[14]); 2231 u[5] = _mm_unpackhi_epi16(t[9], t[14]); 2232 u[6] = _mm_unpacklo_epi16(t[10], t[13]); 2233 u[7] = _mm_unpackhi_epi16(t[10], t[13]); 2234 2235 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 2236 v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 2237 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 2238 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 2239 v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); 2240 v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); 2241 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 2242 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 2243 v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); 2244 v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); 2245 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); 2246 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); 2247 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); 2248 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); 2249 v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); 2250 v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); 2251 2252 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2253 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2254 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2255 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2256 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2257 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2258 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2259 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2260 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 2261 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 2262 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 2263 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 2264 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 2265 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 2266 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 2267 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 2268 2269 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2270 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2271 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2272 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2273 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2274 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2275 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2276 u[7] = 
_mm_srai_epi32(u[7], DCT_CONST_BITS); 2277 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2278 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2279 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2280 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2281 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2282 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2283 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2284 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2285 2286 s[0] = _mm_packs_epi32(u[0], u[1]); 2287 s[1] = _mm_packs_epi32(u[2], u[3]); 2288 s[2] = _mm_packs_epi32(u[4], u[5]); 2289 s[3] = _mm_packs_epi32(u[6], u[7]); 2290 s[4] = _mm_add_epi16(t[4], t[5]); 2291 s[5] = _mm_sub_epi16(t[4], t[5]); 2292 s[6] = _mm_sub_epi16(t[7], t[6]); 2293 s[7] = _mm_add_epi16(t[6], t[7]); 2294 s[8] = t[8]; 2295 s[15] = t[15]; 2296 s[9] = _mm_packs_epi32(u[8], u[9]); 2297 s[14] = _mm_packs_epi32(u[10], u[11]); 2298 s[10] = _mm_packs_epi32(u[12], u[13]); 2299 s[13] = _mm_packs_epi32(u[14], u[15]); 2300 s[11] = t[11]; 2301 s[12] = t[12]; 2302 2303 // stage 5 2304 t[0] = _mm_add_epi16(s[0], s[3]); 2305 t[1] = _mm_add_epi16(s[1], s[2]); 2306 t[2] = _mm_sub_epi16(s[1], s[2]); 2307 t[3] = _mm_sub_epi16(s[0], s[3]); 2308 t[4] = s[4]; 2309 t[7] = s[7]; 2310 2311 u[0] = _mm_unpacklo_epi16(s[5], s[6]); 2312 u[1] = _mm_unpackhi_epi16(s[5], s[6]); 2313 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 2314 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 2315 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 2316 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 2317 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2318 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2319 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2320 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2321 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2322 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2323 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2324 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2325 t[5] = _mm_packs_epi32(u[0], u[1]); 2326 t[6] = _mm_packs_epi32(u[2], u[3]); 2327 2328 t[8] = _mm_add_epi16(s[8], s[11]); 2329 t[9] = _mm_add_epi16(s[9], s[10]); 2330 t[10] = _mm_sub_epi16(s[9], s[10]); 2331 t[11] = _mm_sub_epi16(s[8], s[11]); 2332 t[12] = _mm_sub_epi16(s[15], s[12]); 2333 t[13] = _mm_sub_epi16(s[14], s[13]); 2334 t[14] = _mm_add_epi16(s[13], s[14]); 2335 t[15] = _mm_add_epi16(s[12], s[15]); 2336 2337 // stage 6 2338 s[0] = _mm_add_epi16(t[0], t[7]); 2339 s[1] = _mm_add_epi16(t[1], t[6]); 2340 s[2] = _mm_add_epi16(t[2], t[5]); 2341 s[3] = _mm_add_epi16(t[3], t[4]); 2342 s[4] = _mm_sub_epi16(t[3], t[4]); 2343 s[5] = _mm_sub_epi16(t[2], t[5]); 2344 s[6] = _mm_sub_epi16(t[1], t[6]); 2345 s[7] = _mm_sub_epi16(t[0], t[7]); 2346 s[8] = t[8]; 2347 s[9] = t[9]; 2348 2349 u[0] = _mm_unpacklo_epi16(t[10], t[13]); 2350 u[1] = _mm_unpackhi_epi16(t[10], t[13]); 2351 u[2] = _mm_unpacklo_epi16(t[11], t[12]); 2352 u[3] = _mm_unpackhi_epi16(t[11], t[12]); 2353 2354 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 2355 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 2356 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 2357 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 2358 v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 2359 v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 2360 v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 2361 v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 2362 2363 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2364 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2365 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2366 u[3] = 
          _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

static void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0);
  idct16_8col(in1);
}

static void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}

static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));

  in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
  in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
  in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
  in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
  in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
  in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
  in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
  in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}

static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1<<5);
  const __m128i zero = _mm_setzero_si128();
  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);
  in[8] = _mm_adds_epi16(in[8], final_rounding);
  in[9] = _mm_adds_epi16(in[9], final_rounding);
  in[10] = _mm_adds_epi16(in[10], final_rounding);
  in[11] = _mm_adds_epi16(in[11], final_rounding);
  in[12] = _mm_adds_epi16(in[12], final_rounding);
  in[13] = _mm_adds_epi16(in[13], final_rounding);
  in[14] = _mm_adds_epi16(in[14], final_rounding);
  in[15] = _mm_adds_epi16(in[15], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 6);
  in[1] = _mm_srai_epi16(in[1], 6);
  in[2] = _mm_srai_epi16(in[2], 6);
  in[3] = _mm_srai_epi16(in[3], 6);
  in[4] = _mm_srai_epi16(in[4], 6);
  in[5] = _mm_srai_epi16(in[5], 6);
  in[6] = _mm_srai_epi16(in[6], 6);
  in[7] = _mm_srai_epi16(in[7], 6);
  in[8] = _mm_srai_epi16(in[8], 6);
  in[9] = _mm_srai_epi16(in[9], 6);
  in[10] = _mm_srai_epi16(in[10], 6);
  in[11] = _mm_srai_epi16(in[11], 6);
  in[12] = _mm_srai_epi16(in[12], 6);
  in[13] = _mm_srai_epi16(in[13], 6);
  in[14] = _mm_srai_epi16(in[14], 6);
  in[15] = _mm_srai_epi16(in[15], 6);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
  RECON_AND_STORE(dest, in[8]);
  RECON_AND_STORE(dest, in[9]);
  RECON_AND_STORE(dest, in[10]);
  RECON_AND_STORE(dest, in[11]);
  RECON_AND_STORE(dest, in[12]);
  RECON_AND_STORE(dest, in[13]);
  RECON_AND_STORE(dest, in[14]);
  RECON_AND_STORE(dest, in[15]);
}

void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  load_buffer_8x16(input, in0);
  input += 8;
  load_buffer_8x16(input, in1);

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      idct16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      iadst16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      iadst16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }

  write_buffer_8x16(dest, in0, stride);
  dest += 8;
  write_buffer_8x16(dest, in1, stride);
}

void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 =
pair_set_epi16(-cospi_24_64, -cospi_8_64); 2549 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2550 2551 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 2552 __m128i in[16], l[16]; 2553 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, 2554 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 2555 stp1_8_0, stp1_12_0; 2556 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 2557 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; 2558 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 2559 int i; 2560 // First 1-D inverse DCT 2561 // Load input data. 2562 in[0] = _mm_load_si128((const __m128i *)input); 2563 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); 2564 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); 2565 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); 2566 2567 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); 2568 2569 // Stage2 2570 { 2571 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); 2572 const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); 2573 2574 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); 2575 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); 2576 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); 2577 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); 2578 2579 tmp0 = _mm_add_epi32(tmp0, rounding); 2580 tmp2 = _mm_add_epi32(tmp2, rounding); 2581 tmp5 = _mm_add_epi32(tmp5, rounding); 2582 tmp7 = _mm_add_epi32(tmp7, rounding); 2583 2584 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2585 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2586 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 2587 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 2588 2589 stp2_8 = _mm_packs_epi32(tmp0, tmp2); 2590 stp2_11 = _mm_packs_epi32(tmp5, tmp7); 2591 } 2592 2593 // Stage3 2594 { 2595 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); 2596 2597 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); 2598 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); 2599 2600 tmp0 = _mm_add_epi32(tmp0, rounding); 2601 tmp2 = _mm_add_epi32(tmp2, rounding); 2602 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2603 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2604 2605 stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); 2606 stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); 2607 2608 stp1_4 = _mm_packs_epi32(tmp0, tmp2); 2609 } 2610 2611 // Stage4 2612 { 2613 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); 2614 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); 2615 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); 2616 2617 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); 2618 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); 2619 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); 2620 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); 2621 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); 2622 tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); 2623 2624 tmp0 = _mm_add_epi32(tmp0, rounding); 2625 tmp2 = _mm_add_epi32(tmp2, rounding); 2626 tmp1 = _mm_add_epi32(tmp1, rounding); 2627 tmp3 = _mm_add_epi32(tmp3, rounding); 2628 tmp5 = _mm_add_epi32(tmp5, rounding); 2629 tmp7 = _mm_add_epi32(tmp7, rounding); 2630 2631 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2632 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2633 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2634 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2635 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 2636 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 2637 2638 stp1_0 = _mm_packs_epi32(tmp0, tmp0); 2639 stp1_1 = _mm_packs_epi32(tmp2, tmp2); 2640 stp2_9 = _mm_packs_epi32(tmp1, tmp3); 2641 stp2_10 = _mm_packs_epi32(tmp5, 
                                 tmp7);

    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  }

  // Stage5 and Stage6
  {
    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  }

  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_6 = _mm_packs_epi32(tmp3, tmp1);

    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp2, zero);
    stp2_11 = _mm_packs_epi32(tmp4, zero);
    stp2_12 = _mm_packs_epi32(tmp6, zero);

    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);

    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage7. Left 8x16 only.
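  // Only the upper-left 4x4 block of the 16x16 input can be nonzero on this
  // reduced path, so the first 1-D pass keeps just the left 8x16 half of the
  // intermediate result in l[]; the loop below then applies the second 1-D
  // transform to each 8-column half and reconstructs the full 16x16 output.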
2714 l[0] = _mm_add_epi16(stp2_0, stp1_15); 2715 l[1] = _mm_add_epi16(stp2_1, stp1_14); 2716 l[2] = _mm_add_epi16(stp2_2, stp2_13); 2717 l[3] = _mm_add_epi16(stp2_3, stp2_12); 2718 l[4] = _mm_add_epi16(stp2_4, stp2_11); 2719 l[5] = _mm_add_epi16(stp2_5, stp2_10); 2720 l[6] = _mm_add_epi16(stp2_6, stp1_9); 2721 l[7] = _mm_add_epi16(stp2_7, stp1_8); 2722 l[8] = _mm_sub_epi16(stp2_7, stp1_8); 2723 l[9] = _mm_sub_epi16(stp2_6, stp1_9); 2724 l[10] = _mm_sub_epi16(stp2_5, stp2_10); 2725 l[11] = _mm_sub_epi16(stp2_4, stp2_11); 2726 l[12] = _mm_sub_epi16(stp2_3, stp2_12); 2727 l[13] = _mm_sub_epi16(stp2_2, stp2_13); 2728 l[14] = _mm_sub_epi16(stp2_1, stp1_14); 2729 l[15] = _mm_sub_epi16(stp2_0, stp1_15); 2730 2731 // Second 1-D inverse transform, performed per 8x16 block 2732 for (i = 0; i < 2; i++) { 2733 array_transpose_4X8(l + 8*i, in); 2734 2735 IDCT16_10 2736 2737 // Stage7 2738 in[0] = _mm_add_epi16(stp2_0, stp1_15); 2739 in[1] = _mm_add_epi16(stp2_1, stp1_14); 2740 in[2] = _mm_add_epi16(stp2_2, stp2_13); 2741 in[3] = _mm_add_epi16(stp2_3, stp2_12); 2742 in[4] = _mm_add_epi16(stp2_4, stp2_11); 2743 in[5] = _mm_add_epi16(stp2_5, stp2_10); 2744 in[6] = _mm_add_epi16(stp2_6, stp1_9); 2745 in[7] = _mm_add_epi16(stp2_7, stp1_8); 2746 in[8] = _mm_sub_epi16(stp2_7, stp1_8); 2747 in[9] = _mm_sub_epi16(stp2_6, stp1_9); 2748 in[10] = _mm_sub_epi16(stp2_5, stp2_10); 2749 in[11] = _mm_sub_epi16(stp2_4, stp2_11); 2750 in[12] = _mm_sub_epi16(stp2_3, stp2_12); 2751 in[13] = _mm_sub_epi16(stp2_2, stp2_13); 2752 in[14] = _mm_sub_epi16(stp2_1, stp1_14); 2753 in[15] = _mm_sub_epi16(stp2_0, stp1_15); 2754 2755 // Final rounding and shift 2756 in[0] = _mm_adds_epi16(in[0], final_rounding); 2757 in[1] = _mm_adds_epi16(in[1], final_rounding); 2758 in[2] = _mm_adds_epi16(in[2], final_rounding); 2759 in[3] = _mm_adds_epi16(in[3], final_rounding); 2760 in[4] = _mm_adds_epi16(in[4], final_rounding); 2761 in[5] = _mm_adds_epi16(in[5], final_rounding); 2762 in[6] = _mm_adds_epi16(in[6], final_rounding); 2763 in[7] = _mm_adds_epi16(in[7], final_rounding); 2764 in[8] = _mm_adds_epi16(in[8], final_rounding); 2765 in[9] = _mm_adds_epi16(in[9], final_rounding); 2766 in[10] = _mm_adds_epi16(in[10], final_rounding); 2767 in[11] = _mm_adds_epi16(in[11], final_rounding); 2768 in[12] = _mm_adds_epi16(in[12], final_rounding); 2769 in[13] = _mm_adds_epi16(in[13], final_rounding); 2770 in[14] = _mm_adds_epi16(in[14], final_rounding); 2771 in[15] = _mm_adds_epi16(in[15], final_rounding); 2772 2773 in[0] = _mm_srai_epi16(in[0], 6); 2774 in[1] = _mm_srai_epi16(in[1], 6); 2775 in[2] = _mm_srai_epi16(in[2], 6); 2776 in[3] = _mm_srai_epi16(in[3], 6); 2777 in[4] = _mm_srai_epi16(in[4], 6); 2778 in[5] = _mm_srai_epi16(in[5], 6); 2779 in[6] = _mm_srai_epi16(in[6], 6); 2780 in[7] = _mm_srai_epi16(in[7], 6); 2781 in[8] = _mm_srai_epi16(in[8], 6); 2782 in[9] = _mm_srai_epi16(in[9], 6); 2783 in[10] = _mm_srai_epi16(in[10], 6); 2784 in[11] = _mm_srai_epi16(in[11], 6); 2785 in[12] = _mm_srai_epi16(in[12], 6); 2786 in[13] = _mm_srai_epi16(in[13], 6); 2787 in[14] = _mm_srai_epi16(in[14], 6); 2788 in[15] = _mm_srai_epi16(in[15], 6); 2789 2790 RECON_AND_STORE(dest, in[0]); 2791 RECON_AND_STORE(dest, in[1]); 2792 RECON_AND_STORE(dest, in[2]); 2793 RECON_AND_STORE(dest, in[3]); 2794 RECON_AND_STORE(dest, in[4]); 2795 RECON_AND_STORE(dest, in[5]); 2796 RECON_AND_STORE(dest, in[6]); 2797 RECON_AND_STORE(dest, in[7]); 2798 RECON_AND_STORE(dest, in[8]); 2799 RECON_AND_STORE(dest, in[9]); 2800 RECON_AND_STORE(dest, in[10]); 2801 RECON_AND_STORE(dest, 
in[11]); 2802 RECON_AND_STORE(dest, in[12]); 2803 RECON_AND_STORE(dest, in[13]); 2804 RECON_AND_STORE(dest, in[14]); 2805 RECON_AND_STORE(dest, in[15]); 2806 2807 dest += 8 - (stride * 16); 2808 } 2809} 2810 2811#define LOAD_DQCOEFF(reg, input) \ 2812 { \ 2813 reg = _mm_load_si128((const __m128i *) input); \ 2814 input += 8; \ 2815 } \ 2816 2817#define IDCT32_34 \ 2818/* Stage1 */ \ 2819{ \ 2820 const __m128i zero = _mm_setzero_si128();\ 2821 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ 2822 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ 2823 \ 2824 const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ 2825 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ 2826 \ 2827 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ 2828 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ 2829 \ 2830 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ 2831 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ 2832 \ 2833 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ 2834 stg1_1, stp1_16, stp1_31); \ 2835 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ 2836 stg1_7, stp1_19, stp1_28); \ 2837 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ 2838 stg1_9, stp1_20, stp1_27); \ 2839 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ 2840 stg1_15, stp1_23, stp1_24); \ 2841} \ 2842\ 2843/* Stage2 */ \ 2844{ \ 2845 const __m128i zero = _mm_setzero_si128();\ 2846 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ 2847 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ 2848 \ 2849 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ 2850 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ 2851 \ 2852 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ 2853 stg2_1, stp2_8, stp2_15); \ 2854 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ 2855 stg2_7, stp2_11, stp2_12); \ 2856 \ 2857 stp2_16 = stp1_16; \ 2858 stp2_19 = stp1_19; \ 2859 \ 2860 stp2_20 = stp1_20; \ 2861 stp2_23 = stp1_23; \ 2862 \ 2863 stp2_24 = stp1_24; \ 2864 stp2_27 = stp1_27; \ 2865 \ 2866 stp2_28 = stp1_28; \ 2867 stp2_31 = stp1_31; \ 2868} \ 2869\ 2870/* Stage3 */ \ 2871{ \ 2872 const __m128i zero = _mm_setzero_si128();\ 2873 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ 2874 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ 2875 \ 2876 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ 2877 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ 2878 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ 2879 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ 2880 \ 2881 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ 2882 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ 2883 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ 2884 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ 2885 \ 2886 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ 2887 stg3_1, stp1_4, stp1_7); \ 2888 \ 2889 stp1_8 = stp2_8; \ 2890 stp1_11 = stp2_11; \ 2891 stp1_12 = stp2_12; \ 2892 stp1_15 = stp2_15; \ 2893 \ 2894 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ 2895 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ 2896 stp1_18, stp1_29) \ 2897 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ 2898 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ 2899 stp1_22, stp1_25) \ 2900 \ 2901 stp1_16 = stp2_16; \ 2902 stp1_31 = stp2_31; \ 2903 stp1_19 = stp2_19; \ 2904 stp1_20 = stp2_20; \ 2905 
stp1_23 = stp2_23; \ 2906 stp1_24 = stp2_24; \ 2907 stp1_27 = stp2_27; \ 2908 stp1_28 = stp2_28; \ 2909} \ 2910\ 2911/* Stage4 */ \ 2912{ \ 2913 const __m128i zero = _mm_setzero_si128();\ 2914 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ 2915 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ 2916 \ 2917 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ 2918 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ 2919 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ 2920 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ 2921 \ 2922 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ 2923 stg4_1, stp2_0, stp2_1); \ 2924 \ 2925 stp2_4 = stp1_4; \ 2926 stp2_5 = stp1_4; \ 2927 stp2_6 = stp1_7; \ 2928 stp2_7 = stp1_7; \ 2929 \ 2930 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ 2931 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ 2932 stp2_10, stp2_13) \ 2933 \ 2934 stp2_8 = stp1_8; \ 2935 stp2_15 = stp1_15; \ 2936 stp2_11 = stp1_11; \ 2937 stp2_12 = stp1_12; \ 2938 \ 2939 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ 2940 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ 2941 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ 2942 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ 2943 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ 2944 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ 2945 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ 2946 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ 2947 \ 2948 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ 2949 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ 2950 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ 2951 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ 2952 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ 2953 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ 2954 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ 2955 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ 2956} \ 2957\ 2958/* Stage5 */ \ 2959{ \ 2960 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 2961 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 2962 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 2963 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 2964 \ 2965 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ 2966 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ 2967 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 2968 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 2969 \ 2970 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2971 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2972 \ 2973 stp1_0 = stp2_0; \ 2974 stp1_1 = stp2_1; \ 2975 stp1_2 = stp2_1; \ 2976 stp1_3 = stp2_0; \ 2977 \ 2978 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 2979 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 2980 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 2981 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 2982 \ 2983 tmp0 = _mm_add_epi32(tmp0, rounding); \ 2984 tmp1 = _mm_add_epi32(tmp1, rounding); \ 2985 tmp2 = _mm_add_epi32(tmp2, rounding); \ 2986 tmp3 = _mm_add_epi32(tmp3, rounding); \ 2987 \ 2988 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 2989 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 2990 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 2991 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 2992 \ 2993 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 2994 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 2995 \ 2996 stp1_4 = stp2_4; \ 2997 stp1_7 = stp2_7; \ 2998 \ 2999 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ 3000 stp1_9 = 
_mm_add_epi16(stp2_9, stp2_10); \ 3001 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 3002 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ 3003 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ 3004 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 3005 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 3006 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ 3007 \ 3008 stp1_16 = stp2_16; \ 3009 stp1_17 = stp2_17; \ 3010 \ 3011 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ 3012 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ 3013 stp1_19, stp1_28) \ 3014 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ 3015 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ 3016 stp1_21, stp1_26) \ 3017 \ 3018 stp1_22 = stp2_22; \ 3019 stp1_23 = stp2_23; \ 3020 stp1_24 = stp2_24; \ 3021 stp1_25 = stp2_25; \ 3022 stp1_30 = stp2_30; \ 3023 stp1_31 = stp2_31; \ 3024} \ 3025\ 3026/* Stage6 */ \ 3027{ \ 3028 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 3029 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 3030 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 3031 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 3032 \ 3033 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ 3034 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 3035 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 3036 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ 3037 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ 3038 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 3039 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 3040 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ 3041 \ 3042 stp2_8 = stp1_8; \ 3043 stp2_9 = stp1_9; \ 3044 stp2_14 = stp1_14; \ 3045 stp2_15 = stp1_15; \ 3046 \ 3047 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 3048 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ 3049 stp2_13, stp2_11, stp2_12) \ 3050 \ 3051 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ 3052 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ 3053 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ 3054 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ 3055 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ 3056 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ 3057 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ 3058 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ 3059 \ 3060 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ 3061 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ 3062 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ 3063 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ 3064 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ 3065 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ 3066 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ 3067 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ 3068} \ 3069\ 3070/* Stage7 */ \ 3071{ \ 3072 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 3073 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 3074 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 3075 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 3076 \ 3077 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 3078 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 3079 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ 3080 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ 3081 \ 3082 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ 3083 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ 3084 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ 3085 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ 3086 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ 3087 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ 
3088 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ 3089 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ 3090 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ 3091 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ 3092 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ 3093 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ 3094 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ 3095 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ 3096 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ 3097 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ 3098 \ 3099 stp1_16 = stp2_16; \ 3100 stp1_17 = stp2_17; \ 3101 stp1_18 = stp2_18; \ 3102 stp1_19 = stp2_19; \ 3103 \ 3104 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ 3105 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ 3106 stp1_21, stp1_26) \ 3107 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ 3108 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ 3109 stp1_23, stp1_24) \ 3110 \ 3111 stp1_28 = stp2_28; \ 3112 stp1_29 = stp2_29; \ 3113 stp1_30 = stp2_30; \ 3114 stp1_31 = stp2_31; \ 3115} 3116 3117 3118#define IDCT32 \ 3119/* Stage1 */ \ 3120{ \ 3121 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ 3122 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ 3123 const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ 3124 const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ 3125 \ 3126 const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ 3127 const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ 3128 const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ 3129 const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ 3130 \ 3131 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ 3132 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ 3133 const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ 3134 const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ 3135 \ 3136 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ 3137 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ 3138 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ 3139 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ 3140 \ 3141 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ 3142 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ 3143 stp1_17, stp1_30) \ 3144 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ 3145 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ 3146 stp1_19, stp1_28) \ 3147 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ 3148 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ 3149 stp1_21, stp1_26) \ 3150 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ 3151 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ 3152 stp1_23, stp1_24) \ 3153} \ 3154\ 3155/* Stage2 */ \ 3156{ \ 3157 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ 3158 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ 3159 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ 3160 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ 3161 \ 3162 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ 3163 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ 3164 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ 3165 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ 3166 \ 3167 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ 3168 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ 3169 stp2_14) \ 3170 MULTIPLICATION_AND_ADD(lo_10_22, 
hi_10_22, lo_26_6, hi_26_6, stg2_4, \ 3171 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ 3172 stp2_11, stp2_12) \ 3173 \ 3174 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ 3175 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ 3176 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ 3177 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ 3178 \ 3179 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ 3180 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ 3181 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ 3182 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ 3183 \ 3184 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ 3185 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ 3186 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ 3187 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ 3188 \ 3189 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ 3190 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ 3191 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ 3192 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ 3193} \ 3194\ 3195/* Stage3 */ \ 3196{ \ 3197 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ 3198 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ 3199 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ 3200 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ 3201 \ 3202 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ 3203 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ 3204 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 3205 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 3206 \ 3207 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 3208 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 3209 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 3210 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 3211 \ 3212 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ 3213 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ 3214 stp1_6) \ 3215 \ 3216 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ 3217 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 3218 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 3219 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 3220 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ 3221 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 3222 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 3223 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 3224 \ 3225 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ 3226 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ 3227 stp1_18, stp1_29) \ 3228 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ 3229 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ 3230 stp1_22, stp1_25) \ 3231 \ 3232 stp1_16 = stp2_16; \ 3233 stp1_31 = stp2_31; \ 3234 stp1_19 = stp2_19; \ 3235 stp1_20 = stp2_20; \ 3236 stp1_23 = stp2_23; \ 3237 stp1_24 = stp2_24; \ 3238 stp1_27 = stp2_27; \ 3239 stp1_28 = stp2_28; \ 3240} \ 3241\ 3242/* Stage4 */ \ 3243{ \ 3244 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ 3245 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ 3246 const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ 3247 const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ 3248 \ 3249 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 3250 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 3251 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 3252 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 3253 \ 3254 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, 
hi_8_24, stg4_0, \ 3255 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ 3256 stp2_2, stp2_3) \ 3257 \ 3258 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ 3259 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ 3260 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ 3261 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ 3262 \ 3263 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ 3264 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ 3265 stp2_10, stp2_13) \ 3266 \ 3267 stp2_8 = stp1_8; \ 3268 stp2_15 = stp1_15; \ 3269 stp2_11 = stp1_11; \ 3270 stp2_12 = stp1_12; \ 3271 \ 3272 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ 3273 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ 3274 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ 3275 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ 3276 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ 3277 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ 3278 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ 3279 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ 3280 \ 3281 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ 3282 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ 3283 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ 3284 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ 3285 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ 3286 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ 3287 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ 3288 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ 3289} \ 3290\ 3291/* Stage5 */ \ 3292{ \ 3293 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 3294 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 3295 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 3296 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 3297 \ 3298 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ 3299 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ 3300 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 3301 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 3302 \ 3303 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 3304 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 3305 \ 3306 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ 3307 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ 3308 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ 3309 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ 3310 \ 3311 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 3312 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 3313 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 3314 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 3315 \ 3316 tmp0 = _mm_add_epi32(tmp0, rounding); \ 3317 tmp1 = _mm_add_epi32(tmp1, rounding); \ 3318 tmp2 = _mm_add_epi32(tmp2, rounding); \ 3319 tmp3 = _mm_add_epi32(tmp3, rounding); \ 3320 \ 3321 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 3322 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 3323 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 3324 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 3325 \ 3326 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 3327 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 3328 \ 3329 stp1_4 = stp2_4; \ 3330 stp1_7 = stp2_7; \ 3331 \ 3332 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ 3333 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 3334 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 3335 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ 3336 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ 3337 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 3338 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 3339 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ 3340 \ 3341 stp1_16 = stp2_16; \ 3342 stp1_17 = stp2_17; \ 3343 \ 3344 MULTIPLICATION_AND_ADD(lo_18_29, 
hi_18_29, lo_19_28, hi_19_28, stg4_4, \ 3345 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ 3346 stp1_19, stp1_28) \ 3347 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ 3348 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ 3349 stp1_21, stp1_26) \ 3350 \ 3351 stp1_22 = stp2_22; \ 3352 stp1_23 = stp2_23; \ 3353 stp1_24 = stp2_24; \ 3354 stp1_25 = stp2_25; \ 3355 stp1_30 = stp2_30; \ 3356 stp1_31 = stp2_31; \ 3357} \ 3358\ 3359/* Stage6 */ \ 3360{ \ 3361 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 3362 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 3363 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 3364 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 3365 \ 3366 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ 3367 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 3368 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 3369 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ 3370 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ 3371 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 3372 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 3373 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ 3374 \ 3375 stp2_8 = stp1_8; \ 3376 stp2_9 = stp1_9; \ 3377 stp2_14 = stp1_14; \ 3378 stp2_15 = stp1_15; \ 3379 \ 3380 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 3381 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ 3382 stp2_13, stp2_11, stp2_12) \ 3383 \ 3384 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ 3385 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ 3386 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ 3387 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ 3388 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ 3389 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ 3390 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ 3391 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ 3392 \ 3393 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ 3394 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ 3395 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ 3396 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ 3397 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ 3398 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ 3399 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ 3400 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ 3401} \ 3402\ 3403/* Stage7 */ \ 3404{ \ 3405 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 3406 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 3407 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 3408 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 3409 \ 3410 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 3411 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 3412 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ 3413 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ 3414 \ 3415 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ 3416 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ 3417 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ 3418 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ 3419 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ 3420 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ 3421 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ 3422 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ 3423 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ 3424 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ 3425 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ 3426 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ 3427 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ 3428 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ 3429 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ 3430 
stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ 3431 \ 3432 stp1_16 = stp2_16; \ 3433 stp1_17 = stp2_17; \ 3434 stp1_18 = stp2_18; \ 3435 stp1_19 = stp2_19; \ 3436 \ 3437 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ 3438 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ 3439 stp1_21, stp1_26) \ 3440 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ 3441 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ 3442 stp1_23, stp1_24) \ 3443 \ 3444 stp1_28 = stp2_28; \ 3445 stp1_29 = stp2_29; \ 3446 stp1_30 = stp2_30; \ 3447 stp1_31 = stp2_31; \ 3448} 3449 3450// Only upper-left 8x8 has non-zero coeff 3451void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, 3452 int stride) { 3453 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3454 const __m128i final_rounding = _mm_set1_epi16(1<<5); 3455 3456 // idct constants for each stage 3457 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3458 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3459 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 3460 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 3461 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 3462 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 3463 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 3464 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 3465 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 3466 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 3467 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); 3468 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); 3469 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); 3470 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); 3471 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 3472 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 3473 3474 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 3475 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 3476 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 3477 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 3478 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 3479 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 3480 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 3481 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 3482 3483 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 3484 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 3485 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 3486 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 3487 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 3488 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 3489 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 3490 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 3491 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 3492 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 3493 3494 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 3495 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 3496 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 3497 const __m128i stg4_3 = 
pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[32];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
          stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
          stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // Load input data.
  LOAD_DQCOEFF(in[0], input);
  LOAD_DQCOEFF(in[8], input);
  LOAD_DQCOEFF(in[16], input);
  LOAD_DQCOEFF(in[24], input);
  LOAD_DQCOEFF(in[1], input);
  LOAD_DQCOEFF(in[9], input);
  LOAD_DQCOEFF(in[17], input);
  LOAD_DQCOEFF(in[25], input);
  LOAD_DQCOEFF(in[2], input);
  LOAD_DQCOEFF(in[10], input);
  LOAD_DQCOEFF(in[18], input);
  LOAD_DQCOEFF(in[26], input);
  LOAD_DQCOEFF(in[3], input);
  LOAD_DQCOEFF(in[11], input);
  LOAD_DQCOEFF(in[19], input);
  LOAD_DQCOEFF(in[27], input);

  LOAD_DQCOEFF(in[4], input);
  LOAD_DQCOEFF(in[12], input);
  LOAD_DQCOEFF(in[20], input);
  LOAD_DQCOEFF(in[28], input);
  LOAD_DQCOEFF(in[5], input);
  LOAD_DQCOEFF(in[13], input);
  LOAD_DQCOEFF(in[21], input);
  LOAD_DQCOEFF(in[29], input);
  LOAD_DQCOEFF(in[6], input);
  LOAD_DQCOEFF(in[14], input);
  LOAD_DQCOEFF(in[22], input);
  LOAD_DQCOEFF(in[30], input);
  LOAD_DQCOEFF(in[7], input);
  LOAD_DQCOEFF(in[15], input);
  LOAD_DQCOEFF(in[23], input);
  LOAD_DQCOEFF(in[31], input);

  array_transpose_8x8(in, in);
  array_transpose_8x8(in + 8, in + 8);
  array_transpose_8x8(in + 16, in + 16);
  array_transpose_8x8(in + 24, in + 24);

  IDCT32

  // 1_D: Store 32 intermediate results for each 8x32 block.
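  // Last butterfly of the 1-D pass: for k = 0..15,
  // col[k] = stp1_k + stp1_(31-k) and col[31-k] = stp1_k - stp1_(31-k).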
3560 col[0] = _mm_add_epi16(stp1_0, stp1_31); 3561 col[1] = _mm_add_epi16(stp1_1, stp1_30); 3562 col[2] = _mm_add_epi16(stp1_2, stp1_29); 3563 col[3] = _mm_add_epi16(stp1_3, stp1_28); 3564 col[4] = _mm_add_epi16(stp1_4, stp1_27); 3565 col[5] = _mm_add_epi16(stp1_5, stp1_26); 3566 col[6] = _mm_add_epi16(stp1_6, stp1_25); 3567 col[7] = _mm_add_epi16(stp1_7, stp1_24); 3568 col[8] = _mm_add_epi16(stp1_8, stp1_23); 3569 col[9] = _mm_add_epi16(stp1_9, stp1_22); 3570 col[10] = _mm_add_epi16(stp1_10, stp1_21); 3571 col[11] = _mm_add_epi16(stp1_11, stp1_20); 3572 col[12] = _mm_add_epi16(stp1_12, stp1_19); 3573 col[13] = _mm_add_epi16(stp1_13, stp1_18); 3574 col[14] = _mm_add_epi16(stp1_14, stp1_17); 3575 col[15] = _mm_add_epi16(stp1_15, stp1_16); 3576 col[16] = _mm_sub_epi16(stp1_15, stp1_16); 3577 col[17] = _mm_sub_epi16(stp1_14, stp1_17); 3578 col[18] = _mm_sub_epi16(stp1_13, stp1_18); 3579 col[19] = _mm_sub_epi16(stp1_12, stp1_19); 3580 col[20] = _mm_sub_epi16(stp1_11, stp1_20); 3581 col[21] = _mm_sub_epi16(stp1_10, stp1_21); 3582 col[22] = _mm_sub_epi16(stp1_9, stp1_22); 3583 col[23] = _mm_sub_epi16(stp1_8, stp1_23); 3584 col[24] = _mm_sub_epi16(stp1_7, stp1_24); 3585 col[25] = _mm_sub_epi16(stp1_6, stp1_25); 3586 col[26] = _mm_sub_epi16(stp1_5, stp1_26); 3587 col[27] = _mm_sub_epi16(stp1_4, stp1_27); 3588 col[28] = _mm_sub_epi16(stp1_3, stp1_28); 3589 col[29] = _mm_sub_epi16(stp1_2, stp1_29); 3590 col[30] = _mm_sub_epi16(stp1_1, stp1_30); 3591 col[31] = _mm_sub_epi16(stp1_0, stp1_31); 3592 for (i = 0; i < 4; i++) { 3593 const __m128i zero = _mm_setzero_si128(); 3594 // Transpose 32x8 block to 8x32 block 3595 array_transpose_8x8(col+i*8, in); 3596 IDCT32_34 3597 3598 // 2_D: Calculate the results and store them to destination. 3599 in[0] = _mm_add_epi16(stp1_0, stp1_31); 3600 in[1] = _mm_add_epi16(stp1_1, stp1_30); 3601 in[2] = _mm_add_epi16(stp1_2, stp1_29); 3602 in[3] = _mm_add_epi16(stp1_3, stp1_28); 3603 in[4] = _mm_add_epi16(stp1_4, stp1_27); 3604 in[5] = _mm_add_epi16(stp1_5, stp1_26); 3605 in[6] = _mm_add_epi16(stp1_6, stp1_25); 3606 in[7] = _mm_add_epi16(stp1_7, stp1_24); 3607 in[8] = _mm_add_epi16(stp1_8, stp1_23); 3608 in[9] = _mm_add_epi16(stp1_9, stp1_22); 3609 in[10] = _mm_add_epi16(stp1_10, stp1_21); 3610 in[11] = _mm_add_epi16(stp1_11, stp1_20); 3611 in[12] = _mm_add_epi16(stp1_12, stp1_19); 3612 in[13] = _mm_add_epi16(stp1_13, stp1_18); 3613 in[14] = _mm_add_epi16(stp1_14, stp1_17); 3614 in[15] = _mm_add_epi16(stp1_15, stp1_16); 3615 in[16] = _mm_sub_epi16(stp1_15, stp1_16); 3616 in[17] = _mm_sub_epi16(stp1_14, stp1_17); 3617 in[18] = _mm_sub_epi16(stp1_13, stp1_18); 3618 in[19] = _mm_sub_epi16(stp1_12, stp1_19); 3619 in[20] = _mm_sub_epi16(stp1_11, stp1_20); 3620 in[21] = _mm_sub_epi16(stp1_10, stp1_21); 3621 in[22] = _mm_sub_epi16(stp1_9, stp1_22); 3622 in[23] = _mm_sub_epi16(stp1_8, stp1_23); 3623 in[24] = _mm_sub_epi16(stp1_7, stp1_24); 3624 in[25] = _mm_sub_epi16(stp1_6, stp1_25); 3625 in[26] = _mm_sub_epi16(stp1_5, stp1_26); 3626 in[27] = _mm_sub_epi16(stp1_4, stp1_27); 3627 in[28] = _mm_sub_epi16(stp1_3, stp1_28); 3628 in[29] = _mm_sub_epi16(stp1_2, stp1_29); 3629 in[30] = _mm_sub_epi16(stp1_1, stp1_30); 3630 in[31] = _mm_sub_epi16(stp1_0, stp1_31); 3631 3632 // Final rounding and shift 3633 in[0] = _mm_adds_epi16(in[0], final_rounding); 3634 in[1] = _mm_adds_epi16(in[1], final_rounding); 3635 in[2] = _mm_adds_epi16(in[2], final_rounding); 3636 in[3] = _mm_adds_epi16(in[3], final_rounding); 3637 in[4] = _mm_adds_epi16(in[4], final_rounding); 3638 in[5] = 
_mm_adds_epi16(in[5], final_rounding); 3639 in[6] = _mm_adds_epi16(in[6], final_rounding); 3640 in[7] = _mm_adds_epi16(in[7], final_rounding); 3641 in[8] = _mm_adds_epi16(in[8], final_rounding); 3642 in[9] = _mm_adds_epi16(in[9], final_rounding); 3643 in[10] = _mm_adds_epi16(in[10], final_rounding); 3644 in[11] = _mm_adds_epi16(in[11], final_rounding); 3645 in[12] = _mm_adds_epi16(in[12], final_rounding); 3646 in[13] = _mm_adds_epi16(in[13], final_rounding); 3647 in[14] = _mm_adds_epi16(in[14], final_rounding); 3648 in[15] = _mm_adds_epi16(in[15], final_rounding); 3649 in[16] = _mm_adds_epi16(in[16], final_rounding); 3650 in[17] = _mm_adds_epi16(in[17], final_rounding); 3651 in[18] = _mm_adds_epi16(in[18], final_rounding); 3652 in[19] = _mm_adds_epi16(in[19], final_rounding); 3653 in[20] = _mm_adds_epi16(in[20], final_rounding); 3654 in[21] = _mm_adds_epi16(in[21], final_rounding); 3655 in[22] = _mm_adds_epi16(in[22], final_rounding); 3656 in[23] = _mm_adds_epi16(in[23], final_rounding); 3657 in[24] = _mm_adds_epi16(in[24], final_rounding); 3658 in[25] = _mm_adds_epi16(in[25], final_rounding); 3659 in[26] = _mm_adds_epi16(in[26], final_rounding); 3660 in[27] = _mm_adds_epi16(in[27], final_rounding); 3661 in[28] = _mm_adds_epi16(in[28], final_rounding); 3662 in[29] = _mm_adds_epi16(in[29], final_rounding); 3663 in[30] = _mm_adds_epi16(in[30], final_rounding); 3664 in[31] = _mm_adds_epi16(in[31], final_rounding); 3665 3666 in[0] = _mm_srai_epi16(in[0], 6); 3667 in[1] = _mm_srai_epi16(in[1], 6); 3668 in[2] = _mm_srai_epi16(in[2], 6); 3669 in[3] = _mm_srai_epi16(in[3], 6); 3670 in[4] = _mm_srai_epi16(in[4], 6); 3671 in[5] = _mm_srai_epi16(in[5], 6); 3672 in[6] = _mm_srai_epi16(in[6], 6); 3673 in[7] = _mm_srai_epi16(in[7], 6); 3674 in[8] = _mm_srai_epi16(in[8], 6); 3675 in[9] = _mm_srai_epi16(in[9], 6); 3676 in[10] = _mm_srai_epi16(in[10], 6); 3677 in[11] = _mm_srai_epi16(in[11], 6); 3678 in[12] = _mm_srai_epi16(in[12], 6); 3679 in[13] = _mm_srai_epi16(in[13], 6); 3680 in[14] = _mm_srai_epi16(in[14], 6); 3681 in[15] = _mm_srai_epi16(in[15], 6); 3682 in[16] = _mm_srai_epi16(in[16], 6); 3683 in[17] = _mm_srai_epi16(in[17], 6); 3684 in[18] = _mm_srai_epi16(in[18], 6); 3685 in[19] = _mm_srai_epi16(in[19], 6); 3686 in[20] = _mm_srai_epi16(in[20], 6); 3687 in[21] = _mm_srai_epi16(in[21], 6); 3688 in[22] = _mm_srai_epi16(in[22], 6); 3689 in[23] = _mm_srai_epi16(in[23], 6); 3690 in[24] = _mm_srai_epi16(in[24], 6); 3691 in[25] = _mm_srai_epi16(in[25], 6); 3692 in[26] = _mm_srai_epi16(in[26], 6); 3693 in[27] = _mm_srai_epi16(in[27], 6); 3694 in[28] = _mm_srai_epi16(in[28], 6); 3695 in[29] = _mm_srai_epi16(in[29], 6); 3696 in[30] = _mm_srai_epi16(in[30], 6); 3697 in[31] = _mm_srai_epi16(in[31], 6); 3698 3699 RECON_AND_STORE(dest, in[0]); 3700 RECON_AND_STORE(dest, in[1]); 3701 RECON_AND_STORE(dest, in[2]); 3702 RECON_AND_STORE(dest, in[3]); 3703 RECON_AND_STORE(dest, in[4]); 3704 RECON_AND_STORE(dest, in[5]); 3705 RECON_AND_STORE(dest, in[6]); 3706 RECON_AND_STORE(dest, in[7]); 3707 RECON_AND_STORE(dest, in[8]); 3708 RECON_AND_STORE(dest, in[9]); 3709 RECON_AND_STORE(dest, in[10]); 3710 RECON_AND_STORE(dest, in[11]); 3711 RECON_AND_STORE(dest, in[12]); 3712 RECON_AND_STORE(dest, in[13]); 3713 RECON_AND_STORE(dest, in[14]); 3714 RECON_AND_STORE(dest, in[15]); 3715 RECON_AND_STORE(dest, in[16]); 3716 RECON_AND_STORE(dest, in[17]); 3717 RECON_AND_STORE(dest, in[18]); 3718 RECON_AND_STORE(dest, in[19]); 3719 RECON_AND_STORE(dest, in[20]); 3720 RECON_AND_STORE(dest, in[21]); 3721 RECON_AND_STORE(dest, in[22]); 
3722 RECON_AND_STORE(dest, in[23]); 3723 RECON_AND_STORE(dest, in[24]); 3724 RECON_AND_STORE(dest, in[25]); 3725 RECON_AND_STORE(dest, in[26]); 3726 RECON_AND_STORE(dest, in[27]); 3727 RECON_AND_STORE(dest, in[28]); 3728 RECON_AND_STORE(dest, in[29]); 3729 RECON_AND_STORE(dest, in[30]); 3730 RECON_AND_STORE(dest, in[31]); 3731 3732 dest += 8 - (stride * 32); 3733 } 3734 } 3735 3736void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, 3737 int stride) { 3738 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3739 const __m128i final_rounding = _mm_set1_epi16(1<<5); 3740 3741 // idct constants for each stage 3742 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3743 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3744 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 3745 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 3746 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 3747 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 3748 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 3749 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 3750 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 3751 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 3752 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); 3753 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); 3754 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); 3755 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); 3756 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 3757 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 3758 3759 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 3760 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 3761 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 3762 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 3763 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 3764 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 3765 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 3766 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 3767 3768 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 3769 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 3770 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 3771 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 3772 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 3773 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 3774 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 3775 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 3776 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 3777 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 3778 3779 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 3780 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 3781 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 3782 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 3783 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 3784 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 3785 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 3786 3787 const 
__m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 3788 3789 __m128i in[32], col[128], zero_idx[16]; 3790 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 3791 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 3792 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 3793 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 3794 stp1_30, stp1_31; 3795 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 3796 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 3797 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 3798 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 3799 stp2_30, stp2_31; 3800 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3801 int i, j, i32; 3802 int zero_flag[2]; 3803 3804 for (i = 0; i < 4; i++) { 3805 i32 = (i << 5); 3806 // First 1-D idct 3807 // Load input data. 3808 LOAD_DQCOEFF(in[0], input); 3809 LOAD_DQCOEFF(in[8], input); 3810 LOAD_DQCOEFF(in[16], input); 3811 LOAD_DQCOEFF(in[24], input); 3812 LOAD_DQCOEFF(in[1], input); 3813 LOAD_DQCOEFF(in[9], input); 3814 LOAD_DQCOEFF(in[17], input); 3815 LOAD_DQCOEFF(in[25], input); 3816 LOAD_DQCOEFF(in[2], input); 3817 LOAD_DQCOEFF(in[10], input); 3818 LOAD_DQCOEFF(in[18], input); 3819 LOAD_DQCOEFF(in[26], input); 3820 LOAD_DQCOEFF(in[3], input); 3821 LOAD_DQCOEFF(in[11], input); 3822 LOAD_DQCOEFF(in[19], input); 3823 LOAD_DQCOEFF(in[27], input); 3824 3825 LOAD_DQCOEFF(in[4], input); 3826 LOAD_DQCOEFF(in[12], input); 3827 LOAD_DQCOEFF(in[20], input); 3828 LOAD_DQCOEFF(in[28], input); 3829 LOAD_DQCOEFF(in[5], input); 3830 LOAD_DQCOEFF(in[13], input); 3831 LOAD_DQCOEFF(in[21], input); 3832 LOAD_DQCOEFF(in[29], input); 3833 LOAD_DQCOEFF(in[6], input); 3834 LOAD_DQCOEFF(in[14], input); 3835 LOAD_DQCOEFF(in[22], input); 3836 LOAD_DQCOEFF(in[30], input); 3837 LOAD_DQCOEFF(in[7], input); 3838 LOAD_DQCOEFF(in[15], input); 3839 LOAD_DQCOEFF(in[23], input); 3840 LOAD_DQCOEFF(in[31], input); 3841 3842 // checking if all entries are zero 3843 zero_idx[0] = _mm_or_si128(in[0], in[1]); 3844 zero_idx[1] = _mm_or_si128(in[2], in[3]); 3845 zero_idx[2] = _mm_or_si128(in[4], in[5]); 3846 zero_idx[3] = _mm_or_si128(in[6], in[7]); 3847 zero_idx[4] = _mm_or_si128(in[8], in[9]); 3848 zero_idx[5] = _mm_or_si128(in[10], in[11]); 3849 zero_idx[6] = _mm_or_si128(in[12], in[13]); 3850 zero_idx[7] = _mm_or_si128(in[14], in[15]); 3851 zero_idx[8] = _mm_or_si128(in[16], in[17]); 3852 zero_idx[9] = _mm_or_si128(in[18], in[19]); 3853 zero_idx[10] = _mm_or_si128(in[20], in[21]); 3854 zero_idx[11] = _mm_or_si128(in[22], in[23]); 3855 zero_idx[12] = _mm_or_si128(in[24], in[25]); 3856 zero_idx[13] = _mm_or_si128(in[26], in[27]); 3857 zero_idx[14] = _mm_or_si128(in[28], in[29]); 3858 zero_idx[15] = _mm_or_si128(in[30], in[31]); 3859 3860 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); 3861 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); 3862 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); 3863 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); 3864 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); 3865 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); 3866 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); 3867 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); 3868 3869 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); 3870 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); 3871 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); 3872 zero_idx[11] = 
_mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);

    // Collapse the OR of all 32 coefficient registers into two 32-bit flags.
    zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
    zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
    zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
    zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
    zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);

    // If the 32x8 block is entirely zero, its 1-D IDCT output is zero too,
    // so store zeros and skip the transform for this block.
    if (!zero_flag[0] && !zero_flag[1]) {
      col[i32 + 0] = _mm_setzero_si128();
      col[i32 + 1] = _mm_setzero_si128();
      col[i32 + 2] = _mm_setzero_si128();
      col[i32 + 3] = _mm_setzero_si128();
      col[i32 + 4] = _mm_setzero_si128();
      col[i32 + 5] = _mm_setzero_si128();
      col[i32 + 6] = _mm_setzero_si128();
      col[i32 + 7] = _mm_setzero_si128();
      col[i32 + 8] = _mm_setzero_si128();
      col[i32 + 9] = _mm_setzero_si128();
      col[i32 + 10] = _mm_setzero_si128();
      col[i32 + 11] = _mm_setzero_si128();
      col[i32 + 12] = _mm_setzero_si128();
      col[i32 + 13] = _mm_setzero_si128();
      col[i32 + 14] = _mm_setzero_si128();
      col[i32 + 15] = _mm_setzero_si128();
      col[i32 + 16] = _mm_setzero_si128();
      col[i32 + 17] = _mm_setzero_si128();
      col[i32 + 18] = _mm_setzero_si128();
      col[i32 + 19] = _mm_setzero_si128();
      col[i32 + 20] = _mm_setzero_si128();
      col[i32 + 21] = _mm_setzero_si128();
      col[i32 + 22] = _mm_setzero_si128();
      col[i32 + 23] = _mm_setzero_si128();
      col[i32 + 24] = _mm_setzero_si128();
      col[i32 + 25] = _mm_setzero_si128();
      col[i32 + 26] = _mm_setzero_si128();
      col[i32 + 27] = _mm_setzero_si128();
      col[i32 + 28] = _mm_setzero_si128();
      col[i32 + 29] = _mm_setzero_si128();
      col[i32 + 30] = _mm_setzero_si128();
      col[i32 + 31] = _mm_setzero_si128();
      continue;
    }

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);

    IDCT32

    // 1_D: Store 32 intermediate results for each 8x32 block.
    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
  }
  for (i = 0; i < 4; i++) {
    const __m128i zero = _mm_setzero_si128();
    // Second 1-D idct
    j = i << 3;

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + j, in);
    array_transpose_8x8(col + j + 32, in + 8);
    array_transpose_8x8(col + j + 64, in + 16);
    array_transpose_8x8(col + j + 96, in + 24);

    IDCT32

    // 2_D: Calculate the results and store them to destination.
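    // The outputs below are formed as in the first pass (stp1_k +/- stp1_(31-k)),
    // then rounded with ROUND_POWER_OF_TWO(x, 6) (add 1 << 5, shift right by 6)
    // and added to the prediction in dest via RECON_AND_STORE.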
3975 in[0] = _mm_add_epi16(stp1_0, stp1_31); 3976 in[1] = _mm_add_epi16(stp1_1, stp1_30); 3977 in[2] = _mm_add_epi16(stp1_2, stp1_29); 3978 in[3] = _mm_add_epi16(stp1_3, stp1_28); 3979 in[4] = _mm_add_epi16(stp1_4, stp1_27); 3980 in[5] = _mm_add_epi16(stp1_5, stp1_26); 3981 in[6] = _mm_add_epi16(stp1_6, stp1_25); 3982 in[7] = _mm_add_epi16(stp1_7, stp1_24); 3983 in[8] = _mm_add_epi16(stp1_8, stp1_23); 3984 in[9] = _mm_add_epi16(stp1_9, stp1_22); 3985 in[10] = _mm_add_epi16(stp1_10, stp1_21); 3986 in[11] = _mm_add_epi16(stp1_11, stp1_20); 3987 in[12] = _mm_add_epi16(stp1_12, stp1_19); 3988 in[13] = _mm_add_epi16(stp1_13, stp1_18); 3989 in[14] = _mm_add_epi16(stp1_14, stp1_17); 3990 in[15] = _mm_add_epi16(stp1_15, stp1_16); 3991 in[16] = _mm_sub_epi16(stp1_15, stp1_16); 3992 in[17] = _mm_sub_epi16(stp1_14, stp1_17); 3993 in[18] = _mm_sub_epi16(stp1_13, stp1_18); 3994 in[19] = _mm_sub_epi16(stp1_12, stp1_19); 3995 in[20] = _mm_sub_epi16(stp1_11, stp1_20); 3996 in[21] = _mm_sub_epi16(stp1_10, stp1_21); 3997 in[22] = _mm_sub_epi16(stp1_9, stp1_22); 3998 in[23] = _mm_sub_epi16(stp1_8, stp1_23); 3999 in[24] = _mm_sub_epi16(stp1_7, stp1_24); 4000 in[25] = _mm_sub_epi16(stp1_6, stp1_25); 4001 in[26] = _mm_sub_epi16(stp1_5, stp1_26); 4002 in[27] = _mm_sub_epi16(stp1_4, stp1_27); 4003 in[28] = _mm_sub_epi16(stp1_3, stp1_28); 4004 in[29] = _mm_sub_epi16(stp1_2, stp1_29); 4005 in[30] = _mm_sub_epi16(stp1_1, stp1_30); 4006 in[31] = _mm_sub_epi16(stp1_0, stp1_31); 4007 4008 // Final rounding and shift 4009 in[0] = _mm_adds_epi16(in[0], final_rounding); 4010 in[1] = _mm_adds_epi16(in[1], final_rounding); 4011 in[2] = _mm_adds_epi16(in[2], final_rounding); 4012 in[3] = _mm_adds_epi16(in[3], final_rounding); 4013 in[4] = _mm_adds_epi16(in[4], final_rounding); 4014 in[5] = _mm_adds_epi16(in[5], final_rounding); 4015 in[6] = _mm_adds_epi16(in[6], final_rounding); 4016 in[7] = _mm_adds_epi16(in[7], final_rounding); 4017 in[8] = _mm_adds_epi16(in[8], final_rounding); 4018 in[9] = _mm_adds_epi16(in[9], final_rounding); 4019 in[10] = _mm_adds_epi16(in[10], final_rounding); 4020 in[11] = _mm_adds_epi16(in[11], final_rounding); 4021 in[12] = _mm_adds_epi16(in[12], final_rounding); 4022 in[13] = _mm_adds_epi16(in[13], final_rounding); 4023 in[14] = _mm_adds_epi16(in[14], final_rounding); 4024 in[15] = _mm_adds_epi16(in[15], final_rounding); 4025 in[16] = _mm_adds_epi16(in[16], final_rounding); 4026 in[17] = _mm_adds_epi16(in[17], final_rounding); 4027 in[18] = _mm_adds_epi16(in[18], final_rounding); 4028 in[19] = _mm_adds_epi16(in[19], final_rounding); 4029 in[20] = _mm_adds_epi16(in[20], final_rounding); 4030 in[21] = _mm_adds_epi16(in[21], final_rounding); 4031 in[22] = _mm_adds_epi16(in[22], final_rounding); 4032 in[23] = _mm_adds_epi16(in[23], final_rounding); 4033 in[24] = _mm_adds_epi16(in[24], final_rounding); 4034 in[25] = _mm_adds_epi16(in[25], final_rounding); 4035 in[26] = _mm_adds_epi16(in[26], final_rounding); 4036 in[27] = _mm_adds_epi16(in[27], final_rounding); 4037 in[28] = _mm_adds_epi16(in[28], final_rounding); 4038 in[29] = _mm_adds_epi16(in[29], final_rounding); 4039 in[30] = _mm_adds_epi16(in[30], final_rounding); 4040 in[31] = _mm_adds_epi16(in[31], final_rounding); 4041 4042 in[0] = _mm_srai_epi16(in[0], 6); 4043 in[1] = _mm_srai_epi16(in[1], 6); 4044 in[2] = _mm_srai_epi16(in[2], 6); 4045 in[3] = _mm_srai_epi16(in[3], 6); 4046 in[4] = _mm_srai_epi16(in[4], 6); 4047 in[5] = _mm_srai_epi16(in[5], 6); 4048 in[6] = _mm_srai_epi16(in[6], 6); 4049 in[7] = _mm_srai_epi16(in[7], 6); 4050 in[8] = 
_mm_srai_epi16(in[8], 6); 4051 in[9] = _mm_srai_epi16(in[9], 6); 4052 in[10] = _mm_srai_epi16(in[10], 6); 4053 in[11] = _mm_srai_epi16(in[11], 6); 4054 in[12] = _mm_srai_epi16(in[12], 6); 4055 in[13] = _mm_srai_epi16(in[13], 6); 4056 in[14] = _mm_srai_epi16(in[14], 6); 4057 in[15] = _mm_srai_epi16(in[15], 6); 4058 in[16] = _mm_srai_epi16(in[16], 6); 4059 in[17] = _mm_srai_epi16(in[17], 6); 4060 in[18] = _mm_srai_epi16(in[18], 6); 4061 in[19] = _mm_srai_epi16(in[19], 6); 4062 in[20] = _mm_srai_epi16(in[20], 6); 4063 in[21] = _mm_srai_epi16(in[21], 6); 4064 in[22] = _mm_srai_epi16(in[22], 6); 4065 in[23] = _mm_srai_epi16(in[23], 6); 4066 in[24] = _mm_srai_epi16(in[24], 6); 4067 in[25] = _mm_srai_epi16(in[25], 6); 4068 in[26] = _mm_srai_epi16(in[26], 6); 4069 in[27] = _mm_srai_epi16(in[27], 6); 4070 in[28] = _mm_srai_epi16(in[28], 6); 4071 in[29] = _mm_srai_epi16(in[29], 6); 4072 in[30] = _mm_srai_epi16(in[30], 6); 4073 in[31] = _mm_srai_epi16(in[31], 6); 4074 4075 RECON_AND_STORE(dest, in[0]); 4076 RECON_AND_STORE(dest, in[1]); 4077 RECON_AND_STORE(dest, in[2]); 4078 RECON_AND_STORE(dest, in[3]); 4079 RECON_AND_STORE(dest, in[4]); 4080 RECON_AND_STORE(dest, in[5]); 4081 RECON_AND_STORE(dest, in[6]); 4082 RECON_AND_STORE(dest, in[7]); 4083 RECON_AND_STORE(dest, in[8]); 4084 RECON_AND_STORE(dest, in[9]); 4085 RECON_AND_STORE(dest, in[10]); 4086 RECON_AND_STORE(dest, in[11]); 4087 RECON_AND_STORE(dest, in[12]); 4088 RECON_AND_STORE(dest, in[13]); 4089 RECON_AND_STORE(dest, in[14]); 4090 RECON_AND_STORE(dest, in[15]); 4091 RECON_AND_STORE(dest, in[16]); 4092 RECON_AND_STORE(dest, in[17]); 4093 RECON_AND_STORE(dest, in[18]); 4094 RECON_AND_STORE(dest, in[19]); 4095 RECON_AND_STORE(dest, in[20]); 4096 RECON_AND_STORE(dest, in[21]); 4097 RECON_AND_STORE(dest, in[22]); 4098 RECON_AND_STORE(dest, in[23]); 4099 RECON_AND_STORE(dest, in[24]); 4100 RECON_AND_STORE(dest, in[25]); 4101 RECON_AND_STORE(dest, in[26]); 4102 RECON_AND_STORE(dest, in[27]); 4103 RECON_AND_STORE(dest, in[28]); 4104 RECON_AND_STORE(dest, in[29]); 4105 RECON_AND_STORE(dest, in[30]); 4106 RECON_AND_STORE(dest, in[31]); 4107 4108 dest += 8 - (stride * 32); 4109 } 4110} //NOLINT 4111 4112void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 4113 __m128i dc_value; 4114 const __m128i zero = _mm_setzero_si128(); 4115 int a, i; 4116 4117 a = dct_const_round_shift(input[0] * cospi_16_64); 4118 a = dct_const_round_shift(a * cospi_16_64); 4119 a = ROUND_POWER_OF_TWO(a, 6); 4120 4121 dc_value = _mm_set1_epi16(a); 4122 4123 for (i = 0; i < 4; ++i) { 4124 RECON_AND_STORE(dest, dc_value); 4125 RECON_AND_STORE(dest, dc_value); 4126 RECON_AND_STORE(dest, dc_value); 4127 RECON_AND_STORE(dest, dc_value); 4128 RECON_AND_STORE(dest, dc_value); 4129 RECON_AND_STORE(dest, dc_value); 4130 RECON_AND_STORE(dest, dc_value); 4131 RECON_AND_STORE(dest, dc_value); 4132 RECON_AND_STORE(dest, dc_value); 4133 RECON_AND_STORE(dest, dc_value); 4134 RECON_AND_STORE(dest, dc_value); 4135 RECON_AND_STORE(dest, dc_value); 4136 RECON_AND_STORE(dest, dc_value); 4137 RECON_AND_STORE(dest, dc_value); 4138 RECON_AND_STORE(dest, dc_value); 4139 RECON_AND_STORE(dest, dc_value); 4140 RECON_AND_STORE(dest, dc_value); 4141 RECON_AND_STORE(dest, dc_value); 4142 RECON_AND_STORE(dest, dc_value); 4143 RECON_AND_STORE(dest, dc_value); 4144 RECON_AND_STORE(dest, dc_value); 4145 RECON_AND_STORE(dest, dc_value); 4146 RECON_AND_STORE(dest, dc_value); 4147 RECON_AND_STORE(dest, dc_value); 4148 RECON_AND_STORE(dest, dc_value); 4149 RECON_AND_STORE(dest, 
dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    dest += 8 - (stride * 32);
  }
}