/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

#define RECON_AND_STORE4X4(dest, in_x) \
{ \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  *(int *)dest = _mm_cvtsi128_si32(d0); \
  dest += stride; \
}

void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap columns 2 and 3; after the add/sub below we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap columns 2 and 3; after the add/sub below we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store row 0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store row 1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store row 3 (the pack above leaves rows in 0, 1, 3, 2 order)
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store row 2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
}

static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
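// For reference, the 4-point IDCT butterfly that idct4_1d_sse2 below
// vectorizes, written out in scalar form (a sketch; dct_const_round_shift()
// is the same round-and-shift by DCT_CONST_BITS used throughout this file):
//   step[0] = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
//   step[1] = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
//   step[2] = dct_const_round_shift(in[1] * cospi_24_64 - in[3] * cospi_8_64);
//   step[3] = dct_const_round_shift(in[1] * cospi_8_64 + in[3] * cospi_24_64);
//   out[0] = step[0] + step[3];    out[1] = step[1] + step[2];
//   out[2] = step[1] - step[2];    out[3] = step[0] - step[3];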
static void idct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[2]);
  u[1] = _mm_packs_epi32(v[1], v[3]);
  u[2] = _mm_unpackhi_epi64(u[0], u[0]);
  u[3] = _mm_unpackhi_epi64(u[1], u[1]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[3]);
  in[1] = _mm_add_epi16(u[1], u[2]);
  in[2] = _mm_sub_epi16(u[1], u[2]);
  in[3] = _mm_sub_epi16(u[0], u[3]);
}

static void iadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_add_epi16(in[0], in[3]);
  in7 = _mm_sub_epi16(in7, in[2]);

  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[1], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  in[2] = _mm_unpackhi_epi64(in[0], in[0]);
  in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[4];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadl_epi64((const __m128i *)input);
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[2] = _mm_add_epi16(in[2], eight);
  in[3] = _mm_add_epi16(in[3], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);
  in[2] = _mm_srai_epi16(in[2], 4);
  in[3] = _mm_srai_epi16(in[3], 4);

  RECON_AND_STORE4X4(dest, in[0]);
  RECON_AND_STORE4X4(dest, in[1]);
  RECON_AND_STORE4X4(dest, in[2]);
  RECON_AND_STORE4X4(dest, in[3]);
}

#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }
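// The transpose macros work in three interleave levels: 16-bit unpacks build
// 2x2 blocks, 32-bit unpacks build 4x4 blocks, and 64-bit unpacks assemble
// the final rows. TRANSPOSE_4X8 below applies the same scheme when only the
// low halves of the inputs carry data, so the last four outputs are zeroed.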
#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = out5 = out6 = out7 = zero; \
  }

#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */ \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */ \
    in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */ \
    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */ \
  }

// Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }
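// Per 16-bit lane pair (a, b) interleaved by the unpacks, each result lane of
// MULTIPLICATION_AND_ADD is
//   saturate16((a * c0 + b * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS)
// with (c0, c1) taken from pair_set_epi16. With cst pairs (c, -s) and (s, c)
// this computes one rotation butterfly of a transform stage.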
#define IDCT8_1D \
  /* Stage1 */ \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_4, \
                           stp1_7, stp1_5, stp1_6) \
  } \
  \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0, \
                           stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4 */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

#define RECON_AND_STORE(dest, in_x) \
  { \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    d0 = _mm_add_epi16(in_x, d0); \
    d0 = _mm_packus_epi16(d0, d0); \
    _mm_storel_epi64((__m128i *)(dest), d0); \
    dest += stride; \
  }
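// Scalar sketch of RECON_AND_STORE (a sketch, assuming clip_pixel() from
// vp9_common.h matches the unsigned saturation of _mm_packus_epi16):
//   for (j = 0; j < 8; ++j) dest[j] = clip_pixel(dest[j] + in_x[j]);
//   dest += stride;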
void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8_1D
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}
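// DC-only case: when only input[0] is nonzero, both transform passes reduce
// to a scaling by cospi_16_64 (with the usual intermediate rounding), so the
// routine below computes that single value, applies the final shift of 5,
// and adds the result to every pixel of the 8x8 block.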
void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
}

// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

static void idct8_1d_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  in0 = in[0];
  in1 = in[1];
  in2 = in[2];
  in3 = in[3];
  in4 = in[4];
  in5 = in[5];
  in6 = in[6];
  in7 = in[7];

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8_1D
  in[0] = in0;
  in[1] = in1;
  in[2] = in2;
  in[3] = in3;
  in[4] = in4;
  in[5] = in5;
  in[6] = in6;
  in[7] = in7;
}
static void iadst8_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
}

void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
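  // This path assumes at most the first ten coefficients are nonzero (hence
  // the _10 suffix), which confines the nonzero input to the top-left corner
  // of the 8x8 block: only four input rows are loaded, and the row pass runs
  // on that reduced tile.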
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)

  // Stage1
  { //NOLINT
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);
  }

  // Stage2
  { //NOLINT
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);

    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
  }

  // Stage3
  { //NOLINT
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, zero);
    stp1_6 = _mm_packs_epi32(tmp2, zero);
  }

  // Stage4
  in0 = _mm_adds_epi16(stp1_0, stp2_7);
  in1 = _mm_adds_epi16(stp1_1, stp1_6);
  in2 = _mm_adds_epi16(stp1_2, stp1_5);
  in3 = _mm_adds_epi16(stp1_3, stp2_4);
  in4 = _mm_subs_epi16(stp1_3, stp2_4);
  in5 = _mm_subs_epi16(stp1_2, stp1_5);
  in6 = _mm_subs_epi16(stp1_1, stp1_6);
  in7 = _mm_subs_epi16(stp1_0, stp2_7);
  // Columns. 4x8 Transpose
  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7)

  // 1D idct8x8
  IDCT8_1D

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

#define IDCT16_1D \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
                                int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
      in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
      in10 = zero, in11 = zero, in12 = zero, in13 = zero,
      in14 = zero, in15 = zero;
  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
      l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
      l12 = zero, l13 = zero, l14 = zero, l15 = zero;
  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
      r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
      r12 = zero, r13 = zero, r14 = zero, r15 = zero;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // We work on an 8x16 block each time, and loop four times for the 2-D
  // 16x16 idct.
  for (i = 0; i < 4; i++) {
    // 1-D idct
    if (i < 2) {
      if (i == 1) input += 128;

      // Load input data.
      in0 = _mm_load_si128((const __m128i *)input);
      in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
      in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
      in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
      in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
      in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
      in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
      in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
      in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
      in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
      in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
      in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
      in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
      in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
      in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
      in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));

      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                    in3, in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
    }

    if (i == 2) {
      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
                    in5, in6, in7);
      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11,
                    in12, in13, in14, in15);
    }

    if (i == 3) {
      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10,
                    in11, in12, in13, in14, in15);
    }

    IDCT16_1D

    // Stage7
    if (i == 0) {
      // Left 8x16
      l0 = _mm_add_epi16(stp2_0, stp1_15);
      l1 = _mm_add_epi16(stp2_1, stp1_14);
      l2 = _mm_add_epi16(stp2_2, stp2_13);
      l3 = _mm_add_epi16(stp2_3, stp2_12);
      l4 = _mm_add_epi16(stp2_4, stp2_11);
      l5 = _mm_add_epi16(stp2_5, stp2_10);
      l6 = _mm_add_epi16(stp2_6, stp1_9);
      l7 = _mm_add_epi16(stp2_7, stp1_8);
      l8 = _mm_sub_epi16(stp2_7, stp1_8);
      l9 = _mm_sub_epi16(stp2_6, stp1_9);
      l10 = _mm_sub_epi16(stp2_5, stp2_10);
      l11 = _mm_sub_epi16(stp2_4, stp2_11);
      l12 = _mm_sub_epi16(stp2_3, stp2_12);
      l13 = _mm_sub_epi16(stp2_2, stp2_13);
      l14 = _mm_sub_epi16(stp2_1, stp1_14);
      l15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else if (i == 1) {
      // Right 8x16
      r0 = _mm_add_epi16(stp2_0, stp1_15);
      r1 = _mm_add_epi16(stp2_1, stp1_14);
      r2 = _mm_add_epi16(stp2_2, stp2_13);
      r3 = _mm_add_epi16(stp2_3, stp2_12);
      r4 = _mm_add_epi16(stp2_4, stp2_11);
      r5 = _mm_add_epi16(stp2_5, stp2_10);
      r6 = _mm_add_epi16(stp2_6, stp1_9);
      r7 = _mm_add_epi16(stp2_7, stp1_8);
      r8 = _mm_sub_epi16(stp2_7, stp1_8);
      r9 = _mm_sub_epi16(stp2_6, stp1_9);
      r10 = _mm_sub_epi16(stp2_5, stp2_10);
      r11 = _mm_sub_epi16(stp2_4, stp2_11);
      r12 = _mm_sub_epi16(stp2_3, stp2_12);
      r13 = _mm_sub_epi16(stp2_2, stp2_13);
      r14 = _mm_sub_epi16(stp2_1, stp1_14);
      r15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else {
      // 2-D
      in0 = _mm_add_epi16(stp2_0, stp1_15);
      in1 = _mm_add_epi16(stp2_1, stp1_14);
      in2 = _mm_add_epi16(stp2_2, stp2_13);
      in3 = _mm_add_epi16(stp2_3, stp2_12);
      in4 = _mm_add_epi16(stp2_4, stp2_11);
      in5 = _mm_add_epi16(stp2_5, stp2_10);
      in6 = _mm_add_epi16(stp2_6, stp1_9);
      in7 = _mm_add_epi16(stp2_7, stp1_8);
      in8 = _mm_sub_epi16(stp2_7, stp1_8);
      in9 = _mm_sub_epi16(stp2_6, stp1_9);
      in10 = _mm_sub_epi16(stp2_5, stp2_10);
      in11 = _mm_sub_epi16(stp2_4, stp2_11);
      in12 = _mm_sub_epi16(stp2_3, stp2_12);
      in13 = _mm_sub_epi16(stp2_2, stp2_13);
      in14 = _mm_sub_epi16(stp2_1, stp1_14);
      in15 = _mm_sub_epi16(stp2_0, stp1_15);

      // Final rounding and shift
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);

      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);

      dest += 8 - (stride * 16);
    }
  }
}

void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 2; ++i) {
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    dest += 8 - (stride * 16);
  }
}

static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
  __m128i tbuf[8];
  array_transpose_8x8(res0, res0);
  array_transpose_8x8(res1, tbuf);
  array_transpose_8x8(res0 + 8, res1);
  array_transpose_8x8(res1 + 8, res1 + 8);

  res0[8] = tbuf[0];
  res0[9] = tbuf[1];
  res0[10] = tbuf[2];
  res0[11] = tbuf[3];
  res0[12] = tbuf[4];
  res0[13] = tbuf[5];
  res0[14] = tbuf[6];
  res0[15] = tbuf[7];
}
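// The routine above, in effect, views the 16x16 block as four 8x8 quadrants
// (res0 and res1 hold the left and right 8x16 halves, 16 registers each):
// the two diagonal quadrants are transposed in place, while the two
// off-diagonal quadrants are transposed and swapped through tbuf.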
static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
  __m128i tbuf[8];
  array_transpose_8x8(res0, res0);
  array_transpose_8x8(res1, tbuf);
  array_transpose_8x8(res0 + 8, res1);
  array_transpose_8x8(res1 + 8, res1 + 8);

  res0[8] = tbuf[0];
  res0[9] = tbuf[1];
  res0[10] = tbuf[2];
  res0[11] = tbuf[3];
  res0[12] = tbuf[4];
  res0[13] = tbuf[5];
  res0[14] = tbuf[6];
  res0[15] = tbuf[7];
}
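// The 16x16 transpose above is assembled from four 8x8 transposes: the two
// diagonal quadrants are transposed in place, while the two off-diagonal
// quadrants are transposed and exchanged between the column halves; tbuf
// keeps one quadrant alive so the exchange does not overwrite its source.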
static void iadst16_1d_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}
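// Note on the iadst16 epilogue above: the ADST variant used here negates
// four of the outputs (in[1], in[3], in[13], in[15]) by subtracting from
// kZero, and interleaves the butterfly results back into natural output
// order.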
static void idct16_1d_8col(__m128i *in) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i v[16], u[16], s[16], t[16];

  // stage 1
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8] = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9] = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9] = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_1d_8col(in0);
  idct16_1d_8col(in1);
}

static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_1d_8col(in0);
  iadst16_1d_8col(in1);
}
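// Each *_1d_sse2 wrapper above transposes the two 8x16 column halves first
// and then runs the 8-column 1-D transform on each half, so a single call
// processes what were previously the rows; two consecutive calls therefore
// amount to the full 2-D inverse transform.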
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));

  in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
  in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
  in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
  in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
  in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
  in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
  in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
  in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}

static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();
  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);
  in[8] = _mm_adds_epi16(in[8], final_rounding);
  in[9] = _mm_adds_epi16(in[9], final_rounding);
  in[10] = _mm_adds_epi16(in[10], final_rounding);
  in[11] = _mm_adds_epi16(in[11], final_rounding);
  in[12] = _mm_adds_epi16(in[12], final_rounding);
  in[13] = _mm_adds_epi16(in[13], final_rounding);
  in[14] = _mm_adds_epi16(in[14], final_rounding);
  in[15] = _mm_adds_epi16(in[15], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 6);
  in[1] = _mm_srai_epi16(in[1], 6);
  in[2] = _mm_srai_epi16(in[2], 6);
  in[3] = _mm_srai_epi16(in[3], 6);
  in[4] = _mm_srai_epi16(in[4], 6);
  in[5] = _mm_srai_epi16(in[5], 6);
  in[6] = _mm_srai_epi16(in[6], 6);
  in[7] = _mm_srai_epi16(in[7], 6);
  in[8] = _mm_srai_epi16(in[8], 6);
  in[9] = _mm_srai_epi16(in[9], 6);
  in[10] = _mm_srai_epi16(in[10], 6);
  in[11] = _mm_srai_epi16(in[11], 6);
  in[12] = _mm_srai_epi16(in[12], 6);
  in[13] = _mm_srai_epi16(in[13], 6);
  in[14] = _mm_srai_epi16(in[14], 6);
  in[15] = _mm_srai_epi16(in[15], 6);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
  RECON_AND_STORE(dest, in[8]);
  RECON_AND_STORE(dest, in[9]);
  RECON_AND_STORE(dest, in[10]);
  RECON_AND_STORE(dest, in[11]);
  RECON_AND_STORE(dest, in[12]);
  RECON_AND_STORE(dest, in[13]);
  RECON_AND_STORE(dest, in[14]);
  RECON_AND_STORE(dest, in[15]);
}

void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  load_buffer_8x16(input, in0);
  input += 8;
  load_buffer_8x16(input, in1);

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct16_1d_sse2(in0, in1);
      idct16_1d_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      idct16_1d_sse2(in0, in1);
      iadst16_1d_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      iadst16_1d_sse2(in0, in1);
      idct16_1d_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      iadst16_1d_sse2(in0, in1);
      iadst16_1d_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }

  write_buffer_8x16(dest, in0, stride);
  dest += 8;
  write_buffer_8x16(dest, in1, stride);
}
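// vp9_iht16x16_256_add_sse2 above picks the row/column transform pair from
// tx_type (0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST); the
// final (x + 32) >> 6 rounding and the clamped add to dest happen in
// write_buffer_8x16. A hypothetical caller (names are illustrative, not from
// this file) would hand it an aligned 16x16 block of dequantized int16_t
// coefficients, e.g.:
//   vp9_iht16x16_256_add_sse2(dqcoeff, dst, dst_stride, 3 /* ADST_ADST */);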
void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
          in14 = zero, in15 = zero;
  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
          l12 = zero, l13 = zero, l14 = zero, l15 = zero;

  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14,
          stp1_15, stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14,
          stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // 1-D idct. Load input data.
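  // Shortcut for the at-most-10-coefficient case: the nonzero coefficients
  // are assumed to lie in the top-left 4x4 of the 16x16 block, so only the
  // first four input rows are loaded and transformed below.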
  in0 = _mm_load_si128((const __m128i *)input);
  in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);

  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
    const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
    const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
    tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
    tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8 = _mm_packs_epi32(tmp0, zero);
    stp2_15 = _mm_packs_epi32(tmp2, zero);
    stp2_9 = _mm_packs_epi32(tmp4, zero);
    stp2_14 = _mm_packs_epi32(tmp6, zero);

    stp2_10 = _mm_packs_epi32(tmp1, zero);
    stp2_13 = _mm_packs_epi32(tmp3, zero);
    stp2_11 = _mm_packs_epi32(tmp5, zero);
    stp2_12 = _mm_packs_epi32(tmp7, zero);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);

    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);

    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);
    stp2_9 = _mm_packs_epi32(tmp1, zero);
    stp2_14 = _mm_packs_epi32(tmp3, zero);
    stp2_10 = _mm_packs_epi32(tmp5, zero);
    stp2_13 = _mm_packs_epi32(tmp7, zero);

    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
  }

  // Stage5 and Stage6
  {
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);

    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);

    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
  }

  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp1, zero);
    stp1_6 = _mm_packs_epi32(tmp3, zero);
    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp2, zero);
    stp2_11 = _mm_packs_epi32(tmp4, zero);
    stp2_12 = _mm_packs_epi32(tmp6, zero);

    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
  }

  // Stage7. Left 8x16 only.
  l0 = _mm_add_epi16(stp2_0, stp1_15);
  l1 = _mm_add_epi16(stp2_1, stp1_14);
  l2 = _mm_add_epi16(stp2_2, stp2_13);
  l3 = _mm_add_epi16(stp2_3, stp2_12);
  l4 = _mm_add_epi16(stp2_4, stp2_11);
  l5 = _mm_add_epi16(stp2_5, stp2_10);
  l6 = _mm_add_epi16(stp2_6, stp1_9);
  l7 = _mm_add_epi16(stp2_7, stp1_8);
  l8 = _mm_sub_epi16(stp2_7, stp1_8);
  l9 = _mm_sub_epi16(stp2_6, stp1_9);
  l10 = _mm_sub_epi16(stp2_5, stp2_10);
  l11 = _mm_sub_epi16(stp2_4, stp2_11);
  l12 = _mm_sub_epi16(stp2_3, stp2_12);
  l13 = _mm_sub_epi16(stp2_2, stp2_13);
  l14 = _mm_sub_epi16(stp2_1, stp1_14);
  l15 = _mm_sub_epi16(stp2_0, stp1_15);
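  // Second (column) pass, done as two 8x16 blocks. The row pass above only
  // produced the left 8x16 intermediate (l0..l15); the remaining 1-D inputs
  // are known to be zero for this sparse-coefficient path, so in8..in15 are
  // simply set to zero below instead of being transformed.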
  // 2-D idct. We do 2 8x16 blocks.
  for (i = 0; i < 2; i++) {
    if (i == 0)
      TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
                    in5, in6, in7);

    if (i == 1)
      TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
                    in4, in5, in6, in7);

    in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;

    IDCT16_1D

    // Stage7
    in0 = _mm_add_epi16(stp2_0, stp1_15);
    in1 = _mm_add_epi16(stp2_1, stp1_14);
    in2 = _mm_add_epi16(stp2_2, stp2_13);
    in3 = _mm_add_epi16(stp2_3, stp2_12);
    in4 = _mm_add_epi16(stp2_4, stp2_11);
    in5 = _mm_add_epi16(stp2_5, stp2_10);
    in6 = _mm_add_epi16(stp2_6, stp1_9);
    in7 = _mm_add_epi16(stp2_7, stp1_8);
    in8 = _mm_sub_epi16(stp2_7, stp1_8);
    in9 = _mm_sub_epi16(stp2_6, stp1_9);
    in10 = _mm_sub_epi16(stp2_5, stp2_10);
    in11 = _mm_sub_epi16(stp2_4, stp2_11);
    in12 = _mm_sub_epi16(stp2_3, stp2_12);
    in13 = _mm_sub_epi16(stp2_2, stp2_13);
    in14 = _mm_sub_epi16(stp2_1, stp1_14);
    in15 = _mm_sub_epi16(stp2_0, stp1_15);

    // Final rounding and shift
    in0 = _mm_adds_epi16(in0, final_rounding);
    in1 = _mm_adds_epi16(in1, final_rounding);
    in2 = _mm_adds_epi16(in2, final_rounding);
    in3 = _mm_adds_epi16(in3, final_rounding);
    in4 = _mm_adds_epi16(in4, final_rounding);
    in5 = _mm_adds_epi16(in5, final_rounding);
    in6 = _mm_adds_epi16(in6, final_rounding);
    in7 = _mm_adds_epi16(in7, final_rounding);
    in8 = _mm_adds_epi16(in8, final_rounding);
    in9 = _mm_adds_epi16(in9, final_rounding);
    in10 = _mm_adds_epi16(in10, final_rounding);
    in11 = _mm_adds_epi16(in11, final_rounding);
    in12 = _mm_adds_epi16(in12, final_rounding);
    in13 = _mm_adds_epi16(in13, final_rounding);
    in14 = _mm_adds_epi16(in14, final_rounding);
    in15 = _mm_adds_epi16(in15, final_rounding);

    in0 = _mm_srai_epi16(in0, 6);
    in1 = _mm_srai_epi16(in1, 6);
    in2 = _mm_srai_epi16(in2, 6);
    in3 = _mm_srai_epi16(in3, 6);
    in4 = _mm_srai_epi16(in4, 6);
    in5 = _mm_srai_epi16(in5, 6);
    in6 = _mm_srai_epi16(in6, 6);
    in7 = _mm_srai_epi16(in7, 6);
    in8 = _mm_srai_epi16(in8, 6);
    in9 = _mm_srai_epi16(in9, 6);
    in10 = _mm_srai_epi16(in10, 6);
    in11 = _mm_srai_epi16(in11, 6);
    in12 = _mm_srai_epi16(in12, 6);
    in13 = _mm_srai_epi16(in13, 6);
    in14 = _mm_srai_epi16(in14, 6);
    in15 = _mm_srai_epi16(in15, 6);

    RECON_AND_STORE(dest, in0);
    RECON_AND_STORE(dest, in1);
    RECON_AND_STORE(dest, in2);
    RECON_AND_STORE(dest, in3);
    RECON_AND_STORE(dest, in4);
    RECON_AND_STORE(dest, in5);
    RECON_AND_STORE(dest, in6);
    RECON_AND_STORE(dest, in7);
    RECON_AND_STORE(dest, in8);
    RECON_AND_STORE(dest, in9);
    RECON_AND_STORE(dest, in10);
    RECON_AND_STORE(dest, in11);
    RECON_AND_STORE(dest, in12);
    RECON_AND_STORE(dest, in13);
    RECON_AND_STORE(dest, in14);
    RECON_AND_STORE(dest, in15);

    dest += 8 - (stride * 16);
  }
}

#define LOAD_DQCOEFF(reg, input) \
  { \
    reg = _mm_load_si128((const __m128i *)input); \
    input += 8; \
  }
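// IDCT32_1D below expands to one complete 1-D 32-point IDCT over in0..in31,
// written as a macro (in the same style as IDCT16_1D) so that each use is
// fully inlined into its caller; each braced block is one butterfly stage of
// the transform.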
#define IDCT32_1D \
/* Stage1 */ \
{ \
  const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
  const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
  const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
  const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
  \
  const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
  const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
  const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \
  const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
  \
  const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
  const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
  const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
  const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
  \
  const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
  const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
  const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
  const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
  \
  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
                         stp1_17, stp1_30) \
  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
} \
\
/* Stage2 */ \
{ \
  const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
  const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
  const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
  const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
  \
  const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
  const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
  const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
  const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
  \
  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
                         stp2_14) \
  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
                         stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
  \
  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
  \
  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
} \
\
/* Stage3 */ \
{ \
  const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
  const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
  const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
  const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
  \
  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  \
  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
                         stp1_6) \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  \
  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
                         stp1_18, stp1_29) \
  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
                         stp1_22, stp1_25) \
  \
  stp1_16 = stp2_16; \
  stp1_31 = stp2_31; \
  stp1_19 = stp2_19; \
  stp1_20 = stp2_20; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_27 = stp2_27; \
  stp1_28 = stp2_28; \
} \
\
/* Stage4 */ \
{ \
  const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
  const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
  const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
  const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
  \
  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  \
  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
                         stp2_2, stp2_3) \
  \
  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
  \
  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
                         stp2_10, stp2_13) \
  \
  stp2_8 = stp1_8; \
  stp2_15 = stp1_15; \
  stp2_11 = stp1_11; \
  stp2_12 = stp1_12; \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
} \
\
/* Stage5 */ \
{ \
  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
  \
  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
  \
  tmp0 = _mm_add_epi32(tmp0, rounding); \
  tmp1 = _mm_add_epi32(tmp1, rounding); \
  tmp2 = _mm_add_epi32(tmp2, rounding); \
  tmp3 = _mm_add_epi32(tmp3, rounding); \
  \
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  \
  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  \
  stp1_4 = stp2_4; \
  stp1_7 = stp2_7; \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  \
  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  \
  stp1_22 = stp2_22; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_25 = stp2_25; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
} \
\
/* Stage6 */ \
{ \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
  \
  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
  \
  stp2_8 = stp1_8; \
  stp2_9 = stp1_9; \
  stp2_14 = stp1_14; \
  stp2_15 = stp1_15; \
  \
  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
                         stp2_13, stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
  \
  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
} \
\
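/* The stg6_0/stg4_0 pairs used in Stage6 above and Stage7 below are */ \
/* (-cospi_16_64, cospi_16_64) and (cospi_16_64, cospi_16_64): madd  */ \
/* against interleaved (a, b) lanes gives cospi_16_64 * (b - a) and  */ \
/* cospi_16_64 * (a + b), the shared final rotation of the idct.     */ \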
/* Stage7 */ \
{ \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  stp1_18 = stp2_18; \
  stp1_19 = stp2_19; \
  \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
  \
  stp1_28 = stp2_28; \
  stp1_29 = stp2_29; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
}

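// Each stgN_M constant below packs a cosine pair into every 32-bit lane
// via pair_set_epi16, so that _mm_madd_epi16 against interleaved (a, b)
// lanes evaluates a whole a * c0 + b * c1 rotation per instruction. The
// cospi_*_64 values are the fixed-point (DCT_CONST_BITS) cosines used
// throughout this file.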
// Only the upper-left 8x8 block has non-zero coefficients.
void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
          in24, in25, in26, in27, in28, in29, in30, in31;
  __m128i col[128];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
          stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
          stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;
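  // Pass structure: loop iterations 0..3 run the row idct and park results
  // in col[]; iterations 4..7 transpose those columns back and run the
  // column idct, writing reconstructed pixels to dest. With only the
  // upper-left 8x8 occupied, iteration 0 does all the real row work and
  // iterations 1..3 simply zero-fill their slice of col[].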
  // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32
  // idct.
  for (i = 0; i < 8; i++) {
    i32 = (i << 5);
    if (i == 0) {
      // First 1-D idct: first 8 rows
      // Load input data.
      LOAD_DQCOEFF(in0, input);
      LOAD_DQCOEFF(in8, input);
      LOAD_DQCOEFF(in16, input);
      LOAD_DQCOEFF(in24, input);
      LOAD_DQCOEFF(in1, input);
      LOAD_DQCOEFF(in9, input);
      LOAD_DQCOEFF(in17, input);
      LOAD_DQCOEFF(in25, input);
      LOAD_DQCOEFF(in2, input);
      LOAD_DQCOEFF(in10, input);
      LOAD_DQCOEFF(in18, input);
      LOAD_DQCOEFF(in26, input);
      LOAD_DQCOEFF(in3, input);
      LOAD_DQCOEFF(in11, input);
      LOAD_DQCOEFF(in19, input);
      LOAD_DQCOEFF(in27, input);

      LOAD_DQCOEFF(in4, input);
      LOAD_DQCOEFF(in12, input);
      LOAD_DQCOEFF(in20, input);
      LOAD_DQCOEFF(in28, input);
      LOAD_DQCOEFF(in5, input);
      LOAD_DQCOEFF(in13, input);
      LOAD_DQCOEFF(in21, input);
      LOAD_DQCOEFF(in29, input);
      LOAD_DQCOEFF(in6, input);
      LOAD_DQCOEFF(in14, input);
      LOAD_DQCOEFF(in22, input);
      LOAD_DQCOEFF(in30, input);
      LOAD_DQCOEFF(in7, input);
      LOAD_DQCOEFF(in15, input);
      LOAD_DQCOEFF(in23, input);
      LOAD_DQCOEFF(in31, input);

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
                    in18, in19, in20, in21, in22, in23);
      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
                    in26, in27, in28, in29, in30, in31);
    } else if (i < 4) {
      // First 1-D idct: the remaining 24 rows carry only zero coefficients.
      col[i32 + 0] = _mm_setzero_si128();
      col[i32 + 1] = _mm_setzero_si128();
      col[i32 + 2] = _mm_setzero_si128();
      col[i32 + 3] = _mm_setzero_si128();
      col[i32 + 4] = _mm_setzero_si128();
      col[i32 + 5] = _mm_setzero_si128();
      col[i32 + 6] = _mm_setzero_si128();
      col[i32 + 7] = _mm_setzero_si128();
      col[i32 + 8] = _mm_setzero_si128();
      col[i32 + 9] = _mm_setzero_si128();
      col[i32 + 10] = _mm_setzero_si128();
      col[i32 + 11] = _mm_setzero_si128();
      col[i32 + 12] = _mm_setzero_si128();
      col[i32 + 13] = _mm_setzero_si128();
      col[i32 + 14] = _mm_setzero_si128();
      col[i32 + 15] = _mm_setzero_si128();
      col[i32 + 16] = _mm_setzero_si128();
      col[i32 + 17] = _mm_setzero_si128();
      col[i32 + 18] = _mm_setzero_si128();
      col[i32 + 19] = _mm_setzero_si128();
      col[i32 + 20] = _mm_setzero_si128();
      col[i32 + 21] = _mm_setzero_si128();
      col[i32 + 22] = _mm_setzero_si128();
      col[i32 + 23] = _mm_setzero_si128();
      col[i32 + 24] = _mm_setzero_si128();
      col[i32 + 25] = _mm_setzero_si128();
      col[i32 + 26] = _mm_setzero_si128();
      col[i32 + 27] = _mm_setzero_si128();
      col[i32 + 28] = _mm_setzero_si128();
      col[i32 + 29] = _mm_setzero_si128();
      col[i32 + 30] = _mm_setzero_si128();
      col[i32 + 31] = _mm_setzero_si128();
      continue;
    } else {
      // Second 1-D idct
      j = i - 4;

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
                    in5, in6, in7);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
                    in11, in12, in13, in14, in15);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
                    in19, in20, in21, in22, in23);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
                    in28, in29, in30, in31);
    }

    IDCT32_1D
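    // IDCT32_1D leaves its stage-7 results in stp1_0..stp1_31; the final
    // butterfly below folds them into the 32 ordered outputs.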
    // Final stage
    if (i < 4) {
      // 1-D: Store 32 intermediate results for each 8x32 block.
      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
    } else {
      const __m128i zero = _mm_setzero_si128();

      // 2-D: Calculate the results and store them to the destination.
      in0 = _mm_add_epi16(stp1_0, stp1_31);
      in1 = _mm_add_epi16(stp1_1, stp1_30);
      in2 = _mm_add_epi16(stp1_2, stp1_29);
      in3 = _mm_add_epi16(stp1_3, stp1_28);
      in4 = _mm_add_epi16(stp1_4, stp1_27);
      in5 = _mm_add_epi16(stp1_5, stp1_26);
      in6 = _mm_add_epi16(stp1_6, stp1_25);
      in7 = _mm_add_epi16(stp1_7, stp1_24);
      in8 = _mm_add_epi16(stp1_8, stp1_23);
      in9 = _mm_add_epi16(stp1_9, stp1_22);
      in10 = _mm_add_epi16(stp1_10, stp1_21);
      in11 = _mm_add_epi16(stp1_11, stp1_20);
      in12 = _mm_add_epi16(stp1_12, stp1_19);
      in13 = _mm_add_epi16(stp1_13, stp1_18);
      in14 = _mm_add_epi16(stp1_14, stp1_17);
      in15 = _mm_add_epi16(stp1_15, stp1_16);
      in16 = _mm_sub_epi16(stp1_15, stp1_16);
      in17 = _mm_sub_epi16(stp1_14, stp1_17);
      in18 = _mm_sub_epi16(stp1_13, stp1_18);
      in19 = _mm_sub_epi16(stp1_12, stp1_19);
      in20 = _mm_sub_epi16(stp1_11, stp1_20);
      in21 = _mm_sub_epi16(stp1_10, stp1_21);
      in22 = _mm_sub_epi16(stp1_9, stp1_22);
      in23 = _mm_sub_epi16(stp1_8, stp1_23);
      in24 = _mm_sub_epi16(stp1_7, stp1_24);
      in25 = _mm_sub_epi16(stp1_6, stp1_25);
      in26 = _mm_sub_epi16(stp1_5, stp1_26);
      in27 = _mm_sub_epi16(stp1_4, stp1_27);
      in28 = _mm_sub_epi16(stp1_3, stp1_28);
      in29 = _mm_sub_epi16(stp1_2, stp1_29);
      in30 = _mm_sub_epi16(stp1_1, stp1_30);
      in31 = _mm_sub_epi16(stp1_0, stp1_31);

      // Final rounding and shift
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);
      in16 = _mm_adds_epi16(in16, final_rounding);
      in17 = _mm_adds_epi16(in17, final_rounding);
      in18 = _mm_adds_epi16(in18, final_rounding);
      in19 = _mm_adds_epi16(in19, final_rounding);
      in20 = _mm_adds_epi16(in20, final_rounding);
      in21 = _mm_adds_epi16(in21, final_rounding);
      in22 = _mm_adds_epi16(in22, final_rounding);
      in23 = _mm_adds_epi16(in23, final_rounding);
      in24 = _mm_adds_epi16(in24, final_rounding);
      in25 = _mm_adds_epi16(in25, final_rounding);
      in26 = _mm_adds_epi16(in26, final_rounding);
      in27 = _mm_adds_epi16(in27, final_rounding);
      in28 = _mm_adds_epi16(in28, final_rounding);
      in29 = _mm_adds_epi16(in29, final_rounding);
      in30 = _mm_adds_epi16(in30, final_rounding);
      in31 = _mm_adds_epi16(in31, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);
      in16 = _mm_srai_epi16(in16, 6);
      in17 = _mm_srai_epi16(in17, 6);
      in18 = _mm_srai_epi16(in18, 6);
      in19 = _mm_srai_epi16(in19, 6);
      in20 = _mm_srai_epi16(in20, 6);
      in21 = _mm_srai_epi16(in21, 6);
      in22 = _mm_srai_epi16(in22, 6);
      in23 = _mm_srai_epi16(in23, 6);
      in24 = _mm_srai_epi16(in24, 6);
      in25 = _mm_srai_epi16(in25, 6);
      in26 = _mm_srai_epi16(in26, 6);
      in27 = _mm_srai_epi16(in27, 6);
      in28 = _mm_srai_epi16(in28, 6);
      in29 = _mm_srai_epi16(in29, 6);
      in30 = _mm_srai_epi16(in30, 6);
      in31 = _mm_srai_epi16(in31, 6);

      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);
      RECON_AND_STORE(dest, in16);
      RECON_AND_STORE(dest, in17);
      RECON_AND_STORE(dest, in18);
      RECON_AND_STORE(dest, in19);
      RECON_AND_STORE(dest, in20);
      RECON_AND_STORE(dest, in21);
      RECON_AND_STORE(dest, in22);
      RECON_AND_STORE(dest, in23);
      RECON_AND_STORE(dest, in24);
      RECON_AND_STORE(dest, in25);
      RECON_AND_STORE(dest, in26);
      RECON_AND_STORE(dest, in27);
      RECON_AND_STORE(dest, in28);
      RECON_AND_STORE(dest, in29);
      RECON_AND_STORE(dest, in30);
      RECON_AND_STORE(dest, in31);
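      // The 32 stores above advanced dest by one stride per row; step back
      // to the top of the block and over 8 pixels, to the next 8-wide
      // column strip.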
      dest += 8 - (stride * 32);
    }
  }
}

void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
                                 int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
          in24, in25, in26, in27, in28, in29, in30, in31;
  __m128i col[128];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
          stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
          stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;
  __m128i zero_idx[16];
  int zero_flag[2];
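  // Unlike the 34-coefficient path above, any of the 1024 coefficients may
  // be non-zero here, so every 8x32 chunk is loaded. zero_idx/zero_flag are
  // scratch for an all-zero test that lets fully empty chunks skip the
  // transform.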
  // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32
  // idct.
  for (i = 0; i < 8; i++) {
    i32 = (i << 5);
    if (i < 4) {
      // First 1-D idct
      // Load input data.
      LOAD_DQCOEFF(in0, input);
      LOAD_DQCOEFF(in8, input);
      LOAD_DQCOEFF(in16, input);
      LOAD_DQCOEFF(in24, input);
      LOAD_DQCOEFF(in1, input);
      LOAD_DQCOEFF(in9, input);
      LOAD_DQCOEFF(in17, input);
      LOAD_DQCOEFF(in25, input);
      LOAD_DQCOEFF(in2, input);
      LOAD_DQCOEFF(in10, input);
      LOAD_DQCOEFF(in18, input);
      LOAD_DQCOEFF(in26, input);
      LOAD_DQCOEFF(in3, input);
      LOAD_DQCOEFF(in11, input);
      LOAD_DQCOEFF(in19, input);
      LOAD_DQCOEFF(in27, input);

      LOAD_DQCOEFF(in4, input);
      LOAD_DQCOEFF(in12, input);
      LOAD_DQCOEFF(in20, input);
      LOAD_DQCOEFF(in28, input);
      LOAD_DQCOEFF(in5, input);
      LOAD_DQCOEFF(in13, input);
      LOAD_DQCOEFF(in21, input);
      LOAD_DQCOEFF(in29, input);
      LOAD_DQCOEFF(in6, input);
      LOAD_DQCOEFF(in14, input);
      LOAD_DQCOEFF(in22, input);
      LOAD_DQCOEFF(in30, input);
      LOAD_DQCOEFF(in7, input);
      LOAD_DQCOEFF(in15, input);
      LOAD_DQCOEFF(in23, input);
      LOAD_DQCOEFF(in31, input);
      // Check whether all entries are zero.
      zero_idx[0] = _mm_or_si128(in0, in1);
      zero_idx[1] = _mm_or_si128(in2, in3);
      zero_idx[2] = _mm_or_si128(in4, in5);
      zero_idx[3] = _mm_or_si128(in6, in7);
      zero_idx[4] = _mm_or_si128(in8, in9);
      zero_idx[5] = _mm_or_si128(in10, in11);
      zero_idx[6] = _mm_or_si128(in12, in13);
      zero_idx[7] = _mm_or_si128(in14, in15);
      zero_idx[8] = _mm_or_si128(in16, in17);
      zero_idx[9] = _mm_or_si128(in18, in19);
      zero_idx[10] = _mm_or_si128(in20, in21);
      zero_idx[11] = _mm_or_si128(in22, in23);
      zero_idx[12] = _mm_or_si128(in24, in25);
      zero_idx[13] = _mm_or_si128(in26, in27);
      zero_idx[14] = _mm_or_si128(in28, in29);
      zero_idx[15] = _mm_or_si128(in30, in31);

      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);

      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);

      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
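      // zero_idx[14] now holds the OR of all 32 input vectors; the
      // unpack/shift steps above folded its 128 bits into two 32-bit words,
      // so the chunk is all zero iff both flags are zero.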
      if (!zero_flag[0] && !zero_flag[1]) {
        col[i32 + 0] = _mm_setzero_si128();
        col[i32 + 1] = _mm_setzero_si128();
        col[i32 + 2] = _mm_setzero_si128();
        col[i32 + 3] = _mm_setzero_si128();
        col[i32 + 4] = _mm_setzero_si128();
        col[i32 + 5] = _mm_setzero_si128();
        col[i32 + 6] = _mm_setzero_si128();
        col[i32 + 7] = _mm_setzero_si128();
        col[i32 + 8] = _mm_setzero_si128();
        col[i32 + 9] = _mm_setzero_si128();
        col[i32 + 10] = _mm_setzero_si128();
        col[i32 + 11] = _mm_setzero_si128();
        col[i32 + 12] = _mm_setzero_si128();
        col[i32 + 13] = _mm_setzero_si128();
        col[i32 + 14] = _mm_setzero_si128();
        col[i32 + 15] = _mm_setzero_si128();
        col[i32 + 16] = _mm_setzero_si128();
        col[i32 + 17] = _mm_setzero_si128();
        col[i32 + 18] = _mm_setzero_si128();
        col[i32 + 19] = _mm_setzero_si128();
        col[i32 + 20] = _mm_setzero_si128();
        col[i32 + 21] = _mm_setzero_si128();
        col[i32 + 22] = _mm_setzero_si128();
        col[i32 + 23] = _mm_setzero_si128();
        col[i32 + 24] = _mm_setzero_si128();
        col[i32 + 25] = _mm_setzero_si128();
        col[i32 + 26] = _mm_setzero_si128();
        col[i32 + 27] = _mm_setzero_si128();
        col[i32 + 28] = _mm_setzero_si128();
        col[i32 + 29] = _mm_setzero_si128();
        col[i32 + 30] = _mm_setzero_si128();
        col[i32 + 31] = _mm_setzero_si128();
        continue;
      }

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
                    in18, in19, in20, in21, in22, in23);
      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
                    in26, in27, in28, in29, in30, in31);
    } else {
      // Second 1-D idct
      j = i - 4;

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
                    in5, in6, in7);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
                    in11, in12, in13, in14, in15);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
                    in19, in20, in21, in22, in23);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
                    in28, in29, in30, in31);
    }

    IDCT32_1D
    // Final stage
    if (i < 4) {
      // 1-D: Store 32 intermediate results for each 8x32 block.
      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
    } else {
      const __m128i zero = _mm_setzero_si128();

      // 2-D: Calculate the results and store them to the destination.
      in0 = _mm_add_epi16(stp1_0, stp1_31);
      in1 = _mm_add_epi16(stp1_1, stp1_30);
      in2 = _mm_add_epi16(stp1_2, stp1_29);
      in3 = _mm_add_epi16(stp1_3, stp1_28);
      in4 = _mm_add_epi16(stp1_4, stp1_27);
      in5 = _mm_add_epi16(stp1_5, stp1_26);
      in6 = _mm_add_epi16(stp1_6, stp1_25);
      in7 = _mm_add_epi16(stp1_7, stp1_24);
      in8 = _mm_add_epi16(stp1_8, stp1_23);
      in9 = _mm_add_epi16(stp1_9, stp1_22);
      in10 = _mm_add_epi16(stp1_10, stp1_21);
      in11 = _mm_add_epi16(stp1_11, stp1_20);
      in12 = _mm_add_epi16(stp1_12, stp1_19);
      in13 = _mm_add_epi16(stp1_13, stp1_18);
      in14 = _mm_add_epi16(stp1_14, stp1_17);
      in15 = _mm_add_epi16(stp1_15, stp1_16);
      in16 = _mm_sub_epi16(stp1_15, stp1_16);
      in17 = _mm_sub_epi16(stp1_14, stp1_17);
      in18 = _mm_sub_epi16(stp1_13, stp1_18);
      in19 = _mm_sub_epi16(stp1_12, stp1_19);
      in20 = _mm_sub_epi16(stp1_11, stp1_20);
      in21 = _mm_sub_epi16(stp1_10, stp1_21);
      in22 = _mm_sub_epi16(stp1_9, stp1_22);
      in23 = _mm_sub_epi16(stp1_8, stp1_23);
      in24 = _mm_sub_epi16(stp1_7, stp1_24);
      in25 = _mm_sub_epi16(stp1_6, stp1_25);
      in26 = _mm_sub_epi16(stp1_5, stp1_26);
      in27 = _mm_sub_epi16(stp1_4, stp1_27);
      in28 = _mm_sub_epi16(stp1_3, stp1_28);
      in29 = _mm_sub_epi16(stp1_2, stp1_29);
      in30 = _mm_sub_epi16(stp1_1, stp1_30);
      in31 = _mm_sub_epi16(stp1_0, stp1_31);

      // Final rounding and shift
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);
      in16 = _mm_adds_epi16(in16, final_rounding);
      in17 = _mm_adds_epi16(in17, final_rounding);
      in18 = _mm_adds_epi16(in18, final_rounding);
      in19 = _mm_adds_epi16(in19, final_rounding);
      in20 = _mm_adds_epi16(in20, final_rounding);
      in21 = _mm_adds_epi16(in21, final_rounding);
      in22 = _mm_adds_epi16(in22, final_rounding);
      in23 = _mm_adds_epi16(in23, final_rounding);
      in24 = _mm_adds_epi16(in24, final_rounding);
      in25 = _mm_adds_epi16(in25, final_rounding);
      in26 = _mm_adds_epi16(in26, final_rounding);
      in27 = _mm_adds_epi16(in27, final_rounding);
      in28 = _mm_adds_epi16(in28, final_rounding);
      in29 = _mm_adds_epi16(in29, final_rounding);
      in30 = _mm_adds_epi16(in30, final_rounding);
      in31 = _mm_adds_epi16(in31, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);
      in16 = _mm_srai_epi16(in16, 6);
      in17 = _mm_srai_epi16(in17, 6);
      in18 = _mm_srai_epi16(in18, 6);
      in19 = _mm_srai_epi16(in19, 6);
      in20 = _mm_srai_epi16(in20, 6);
      in21 = _mm_srai_epi16(in21, 6);
      in22 = _mm_srai_epi16(in22, 6);
      in23 = _mm_srai_epi16(in23, 6);
      in24 = _mm_srai_epi16(in24, 6);
      in25 = _mm_srai_epi16(in25, 6);
      in26 = _mm_srai_epi16(in26, 6);
      in27 = _mm_srai_epi16(in27, 6);
      in28 = _mm_srai_epi16(in28, 6);
      in29 = _mm_srai_epi16(in29, 6);
      in30 = _mm_srai_epi16(in30, 6);
      in31 = _mm_srai_epi16(in31, 6);

      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);
      RECON_AND_STORE(dest, in16);
      RECON_AND_STORE(dest, in17);
      RECON_AND_STORE(dest, in18);
      RECON_AND_STORE(dest, in19);
      RECON_AND_STORE(dest, in20);
      RECON_AND_STORE(dest, in21);
      RECON_AND_STORE(dest, in22);
      RECON_AND_STORE(dest, in23);
      RECON_AND_STORE(dest, in24);
      RECON_AND_STORE(dest, in25);
      RECON_AND_STORE(dest, in26);
      RECON_AND_STORE(dest, in27);
      RECON_AND_STORE(dest, in28);
      RECON_AND_STORE(dest, in29);
      RECON_AND_STORE(dest, in30);
      RECON_AND_STORE(dest, in31);

      dest += 8 - (stride * 32);
    }
  }
} //NOLINT

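// When only the DC coefficient is non-zero, each idct pass reduces to a
// multiply by cospi_16_64 (with the usual rounding), and after the final
// shift by 6 every output pixel receives the same offset. Compute that
// offset once in scalar code, splat it, and add it across the 32x32 block.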
void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 4; ++i) {
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    dest += 8 - (stride * 32);
  }
}