vp9_idct_intrin_sse2.c revision 5ae7ac49f08a179e4f054d99fcfc9dce78d26e58
1/* 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <emmintrin.h> // SSE2 13#include "./vpx_config.h" 14#include "vpx/vpx_integer.h" 15#include "vp9/common/vp9_common.h" 16#include "vp9/common/vp9_idct.h" 17 18void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 19 const __m128i zero = _mm_setzero_si128(); 20 const __m128i eight = _mm_set1_epi16(8); 21 const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, 22 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, 23 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, 24 (int16_t)cospi_8_64, (int16_t)cospi_24_64); 25 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 26 __m128i input0, input1, input2, input3; 27 28 // Rows 29 input0 = _mm_loadl_epi64((const __m128i *)input); 30 input1 = _mm_loadl_epi64((const __m128i *)(input + 4)); 31 input2 = _mm_loadl_epi64((const __m128i *)(input + 8)); 32 input3 = _mm_loadl_epi64((const __m128i *)(input + 12)); 33 34 // Construct i3, i1, i3, i1, i2, i0, i2, i0 35 input0 = _mm_shufflelo_epi16(input0, 0xd8); 36 input1 = _mm_shufflelo_epi16(input1, 0xd8); 37 input2 = _mm_shufflelo_epi16(input2, 0xd8); 38 input3 = _mm_shufflelo_epi16(input3, 0xd8); 39 40 input0 = _mm_unpacklo_epi32(input0, input0); 41 input1 = _mm_unpacklo_epi32(input1, input1); 42 input2 = _mm_unpacklo_epi32(input2, input2); 43 input3 = _mm_unpacklo_epi32(input3, input3); 44 45 // Stage 1 46 input0 = _mm_madd_epi16(input0, cst); 47 input1 = _mm_madd_epi16(input1, cst); 48 input2 = _mm_madd_epi16(input2, cst); 49 input3 = _mm_madd_epi16(input3, cst); 50 51 input0 = _mm_add_epi32(input0, rounding); 52 input1 = _mm_add_epi32(input1, rounding); 53 input2 = _mm_add_epi32(input2, rounding); 54 input3 = _mm_add_epi32(input3, rounding); 55 56 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); 57 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); 58 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); 59 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); 60 61 // Stage 2 62 input0 = _mm_packs_epi32(input0, zero); 63 input1 = _mm_packs_epi32(input1, zero); 64 input2 = _mm_packs_epi32(input2, zero); 65 input3 = _mm_packs_epi32(input3, zero); 66 67 // Transpose 68 input1 = _mm_unpacklo_epi16(input0, input1); 69 input3 = _mm_unpacklo_epi16(input2, input3); 70 input0 = _mm_unpacklo_epi32(input1, input3); 71 input1 = _mm_unpackhi_epi32(input1, input3); 72 73 // Switch column2, column 3, and then, we got: 74 // input2: column1, column 0; input3: column2, column 3. 75 input1 = _mm_shuffle_epi32(input1, 0x4e); 76 input2 = _mm_add_epi16(input0, input1); 77 input3 = _mm_sub_epi16(input0, input1); 78 79 // Columns 80 // Construct i3, i1, i3, i1, i2, i0, i2, i0 81 input0 = _mm_shufflelo_epi16(input2, 0xd8); 82 input1 = _mm_shufflehi_epi16(input2, 0xd8); 83 input2 = _mm_shufflehi_epi16(input3, 0xd8); 84 input3 = _mm_shufflelo_epi16(input3, 0xd8); 85 86 input0 = _mm_unpacklo_epi32(input0, input0); 87 input1 = _mm_unpackhi_epi32(input1, input1); 88 input2 = _mm_unpackhi_epi32(input2, input2); 89 input3 = _mm_unpacklo_epi32(input3, input3); 90 91 // Stage 1 92 input0 = _mm_madd_epi16(input0, cst); 93 input1 = _mm_madd_epi16(input1, cst); 94 input2 = _mm_madd_epi16(input2, cst); 95 input3 = _mm_madd_epi16(input3, cst); 96 97 input0 = _mm_add_epi32(input0, rounding); 98 input1 = _mm_add_epi32(input1, rounding); 99 input2 = _mm_add_epi32(input2, rounding); 100 input3 = _mm_add_epi32(input3, rounding); 101 102 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); 103 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); 104 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); 105 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); 106 107 // Stage 2 108 input0 = _mm_packs_epi32(input0, zero); 109 input1 = _mm_packs_epi32(input1, zero); 110 input2 = _mm_packs_epi32(input2, zero); 111 input3 = _mm_packs_epi32(input3, zero); 112 113 // Transpose 114 input1 = _mm_unpacklo_epi16(input0, input1); 115 input3 = _mm_unpacklo_epi16(input2, input3); 116 input0 = _mm_unpacklo_epi32(input1, input3); 117 input1 = _mm_unpackhi_epi32(input1, input3); 118 119 // Switch column2, column 3, and then, we got: 120 // input2: column1, column 0; input3: column2, column 3. 121 input1 = _mm_shuffle_epi32(input1, 0x4e); 122 input2 = _mm_add_epi16(input0, input1); 123 input3 = _mm_sub_epi16(input0, input1); 124 125 // Final round and shift 126 input2 = _mm_add_epi16(input2, eight); 127 input3 = _mm_add_epi16(input3, eight); 128 129 input2 = _mm_srai_epi16(input2, 4); 130 input3 = _mm_srai_epi16(input3, 4); 131 132#define RECON_AND_STORE4X4(dest, in_x) \ 133 { \ 134 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ 135 d0 = _mm_unpacklo_epi8(d0, zero); \ 136 d0 = _mm_add_epi16(in_x, d0); \ 137 d0 = _mm_packus_epi16(d0, d0); \ 138 *(int *)dest = _mm_cvtsi128_si32(d0); \ 139 dest += stride; \ 140 } 141 142 input0 = _mm_srli_si128(input2, 8); 143 input1 = _mm_srli_si128(input3, 8); 144 145 RECON_AND_STORE4X4(dest, input2); 146 RECON_AND_STORE4X4(dest, input0); 147 RECON_AND_STORE4X4(dest, input1); 148 RECON_AND_STORE4X4(dest, input3); 149} 150 151void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 152 __m128i dc_value; 153 const __m128i zero = _mm_setzero_si128(); 154 int a; 155 156 a = dct_const_round_shift(input[0] * cospi_16_64); 157 a = dct_const_round_shift(a * cospi_16_64); 158 a = ROUND_POWER_OF_TWO(a, 4); 159 160 dc_value = _mm_set1_epi16(a); 161 162 RECON_AND_STORE4X4(dest, dc_value); 163 RECON_AND_STORE4X4(dest, dc_value); 164 RECON_AND_STORE4X4(dest, dc_value); 165 RECON_AND_STORE4X4(dest, dc_value); 166} 167 168static INLINE void transpose_4x4(__m128i *res) { 169 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 170 const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); 171 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); 172 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); 173 174 res[1] = _mm_unpackhi_epi64(res[0], res[0]); 175 res[3] = _mm_unpackhi_epi64(res[2], res[2]); 176} 177 178static void idct4_1d_sse2(__m128i *in) { 179 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); 180 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 181 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 182 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 183 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 184 __m128i u[8], v[8]; 185 186 transpose_4x4(in); 187 // stage 1 188 u[0] = _mm_unpacklo_epi16(in[0], in[2]); 189 u[1] = _mm_unpacklo_epi16(in[1], in[3]); 190 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 191 v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 192 v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 193 v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 194 195 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 196 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 197 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 198 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 199 200 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 201 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 202 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 203 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 204 205 u[0] = _mm_packs_epi32(v[0], v[2]); 206 u[1] = _mm_packs_epi32(v[1], v[3]); 207 u[2] = _mm_unpackhi_epi64(u[0], u[0]); 208 u[3] = _mm_unpackhi_epi64(u[1], u[1]); 209 210 // stage 2 211 in[0] = _mm_add_epi16(u[0], u[3]); 212 in[1] = _mm_add_epi16(u[1], u[2]); 213 in[2] = _mm_sub_epi16(u[1], u[2]); 214 in[3] = _mm_sub_epi16(u[0], u[3]); 215} 216 217static void iadst4_1d_sse2(__m128i *in) { 218 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); 219 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); 220 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); 221 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); 222 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); 223 const __m128i kZero = _mm_set1_epi16(0); 224 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 225 __m128i u[8], v[8], in7; 226 227 transpose_4x4(in); 228 in7 = _mm_add_epi16(in[0], in[3]); 229 in7 = _mm_sub_epi16(in7, in[2]); 230 231 u[0] = _mm_unpacklo_epi16(in[0], in[2]); 232 u[1] = _mm_unpacklo_epi16(in[1], in[3]); 233 u[2] = _mm_unpacklo_epi16(in7, kZero); 234 u[3] = _mm_unpacklo_epi16(in[1], kZero); 235 236 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 237 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 238 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 239 v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 240 v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 241 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 242 243 u[0] = _mm_add_epi32(v[0], v[1]); 244 u[1] = _mm_add_epi32(v[3], v[4]); 245 u[2] = v[2]; 246 u[3] = _mm_add_epi32(u[0], u[1]); 247 u[4] = _mm_slli_epi32(v[5], 2); 248 u[5] = _mm_add_epi32(u[3], v[5]); 249 u[6] = _mm_sub_epi32(u[5], u[4]); 250 251 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 252 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 253 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 254 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 255 256 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 257 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 258 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 259 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 260 261 in[0] = _mm_packs_epi32(u[0], u[2]); 262 in[1] = _mm_packs_epi32(u[1], u[3]); 263 in[2] = _mm_unpackhi_epi64(in[0], in[0]); 264 in[3] = _mm_unpackhi_epi64(in[1], in[1]); 265} 266 267void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, 268 int tx_type) { 269 __m128i in[4]; 270 const __m128i zero = _mm_setzero_si128(); 271 const __m128i eight = _mm_set1_epi16(8); 272 273 in[0] = _mm_loadl_epi64((const __m128i *)input); 274 in[1] = _mm_loadl_epi64((const __m128i *)(input + 4)); 275 in[2] = _mm_loadl_epi64((const __m128i *)(input + 8)); 276 in[3] = _mm_loadl_epi64((const __m128i *)(input + 12)); 277 278 switch (tx_type) { 279 case 0: // DCT_DCT 280 idct4_1d_sse2(in); 281 idct4_1d_sse2(in); 282 break; 283 case 1: // ADST_DCT 284 idct4_1d_sse2(in); 285 iadst4_1d_sse2(in); 286 break; 287 case 2: // DCT_ADST 288 iadst4_1d_sse2(in); 289 idct4_1d_sse2(in); 290 break; 291 case 3: // ADST_ADST 292 iadst4_1d_sse2(in); 293 iadst4_1d_sse2(in); 294 break; 295 default: 296 assert(0); 297 break; 298 } 299 300 // Final round and shift 301 in[0] = _mm_add_epi16(in[0], eight); 302 in[1] = _mm_add_epi16(in[1], eight); 303 in[2] = _mm_add_epi16(in[2], eight); 304 in[3] = _mm_add_epi16(in[3], eight); 305 306 in[0] = _mm_srai_epi16(in[0], 4); 307 in[1] = _mm_srai_epi16(in[1], 4); 308 in[2] = _mm_srai_epi16(in[2], 4); 309 in[3] = _mm_srai_epi16(in[3], 4); 310 311 RECON_AND_STORE4X4(dest, in[0]); 312 RECON_AND_STORE4X4(dest, in[1]); 313 RECON_AND_STORE4X4(dest, in[2]); 314 RECON_AND_STORE4X4(dest, in[3]); 315} 316 317#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ 318 out0, out1, out2, out3, out4, out5, out6, out7) \ 319 { \ 320 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 321 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 322 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ 323 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ 324 const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ 325 const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ 326 const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ 327 const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ 328 \ 329 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ 330 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ 331 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ 332 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ 333 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ 334 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ 335 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ 336 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ 337 \ 338 out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ 339 out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ 340 out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ 341 out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ 342 out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ 343 out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ 344 out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ 345 out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ 346 } 347 348#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \ 349 out0, out1, out2, out3, out4, out5, out6, out7) \ 350 { \ 351 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 352 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 353 const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ 354 const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ 355 \ 356 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ 357 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ 358 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ 359 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ 360 \ 361 out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ 362 out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ 363 out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ 364 out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ 365 out4 = out5 = out6 = out7 = zero; \ 366 } 367 368#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \ 369 { \ 370 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 371 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 372 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ 373 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ 374 \ 375 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ 376 in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ 377 in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \ 378 in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \ 379 } 380 381// Define Macro for multiplying elements by constants and adding them together. 382#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ 383 cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ 384 { \ 385 tmp0 = _mm_madd_epi16(lo_0, cst0); \ 386 tmp1 = _mm_madd_epi16(hi_0, cst0); \ 387 tmp2 = _mm_madd_epi16(lo_0, cst1); \ 388 tmp3 = _mm_madd_epi16(hi_0, cst1); \ 389 tmp4 = _mm_madd_epi16(lo_1, cst2); \ 390 tmp5 = _mm_madd_epi16(hi_1, cst2); \ 391 tmp6 = _mm_madd_epi16(lo_1, cst3); \ 392 tmp7 = _mm_madd_epi16(hi_1, cst3); \ 393 \ 394 tmp0 = _mm_add_epi32(tmp0, rounding); \ 395 tmp1 = _mm_add_epi32(tmp1, rounding); \ 396 tmp2 = _mm_add_epi32(tmp2, rounding); \ 397 tmp3 = _mm_add_epi32(tmp3, rounding); \ 398 tmp4 = _mm_add_epi32(tmp4, rounding); \ 399 tmp5 = _mm_add_epi32(tmp5, rounding); \ 400 tmp6 = _mm_add_epi32(tmp6, rounding); \ 401 tmp7 = _mm_add_epi32(tmp7, rounding); \ 402 \ 403 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 404 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 405 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 406 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 407 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ 408 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ 409 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ 410 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ 411 \ 412 res0 = _mm_packs_epi32(tmp0, tmp1); \ 413 res1 = _mm_packs_epi32(tmp2, tmp3); \ 414 res2 = _mm_packs_epi32(tmp4, tmp5); \ 415 res3 = _mm_packs_epi32(tmp6, tmp7); \ 416 } 417 418#define IDCT8_1D \ 419 /* Stage1 */ \ 420 { \ 421 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ 422 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ 423 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ 424 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ 425 \ 426 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ 427 stg1_1, stg1_2, stg1_3, stp1_4, \ 428 stp1_7, stp1_5, stp1_6) \ 429 } \ 430 \ 431 /* Stage2 */ \ 432 { \ 433 const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ 434 const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ 435 const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ 436 const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ 437 \ 438 MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ 439 stg2_1, stg2_2, stg2_3, stp2_0, \ 440 stp2_1, stp2_2, stp2_3) \ 441 \ 442 stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ 443 stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ 444 stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ 445 stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ 446 } \ 447 \ 448 /* Stage3 */ \ 449 { \ 450 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 451 const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 452 \ 453 stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ 454 stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ 455 stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ 456 stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ 457 \ 458 tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ 459 tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ 460 tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ 461 tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ 462 \ 463 tmp0 = _mm_add_epi32(tmp0, rounding); \ 464 tmp1 = _mm_add_epi32(tmp1, rounding); \ 465 tmp2 = _mm_add_epi32(tmp2, rounding); \ 466 tmp3 = _mm_add_epi32(tmp3, rounding); \ 467 \ 468 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 469 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 470 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 471 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 472 \ 473 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 474 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 475 } \ 476 \ 477 /* Stage4 */ \ 478 in0 = _mm_adds_epi16(stp1_0, stp2_7); \ 479 in1 = _mm_adds_epi16(stp1_1, stp1_6); \ 480 in2 = _mm_adds_epi16(stp1_2, stp1_5); \ 481 in3 = _mm_adds_epi16(stp1_3, stp2_4); \ 482 in4 = _mm_subs_epi16(stp1_3, stp2_4); \ 483 in5 = _mm_subs_epi16(stp1_2, stp1_5); \ 484 in6 = _mm_subs_epi16(stp1_1, stp1_6); \ 485 in7 = _mm_subs_epi16(stp1_0, stp2_7); 486 487#define RECON_AND_STORE(dest, in_x) \ 488 { \ 489 __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ 490 d0 = _mm_unpacklo_epi8(d0, zero); \ 491 d0 = _mm_add_epi16(in_x, d0); \ 492 d0 = _mm_packus_epi16(d0, d0); \ 493 _mm_storel_epi64((__m128i *)(dest), d0); \ 494 dest += stride; \ 495 } 496 497void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 498 const __m128i zero = _mm_setzero_si128(); 499 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 500 const __m128i final_rounding = _mm_set1_epi16(1<<4); 501 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 502 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 503 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 504 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 505 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 506 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 507 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 508 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 509 510 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 511 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 512 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 513 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 514 int i; 515 516 // Load input data. 517 in0 = _mm_load_si128((const __m128i *)input); 518 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 519 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 520 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 521 in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); 522 in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); 523 in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); 524 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); 525 526 // 2-D 527 for (i = 0; i < 2; i++) { 528 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() 529 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 530 in4, in5, in6, in7); 531 532 // 4-stage 1D idct8x8 533 IDCT8_1D 534 } 535 536 // Final rounding and shift 537 in0 = _mm_adds_epi16(in0, final_rounding); 538 in1 = _mm_adds_epi16(in1, final_rounding); 539 in2 = _mm_adds_epi16(in2, final_rounding); 540 in3 = _mm_adds_epi16(in3, final_rounding); 541 in4 = _mm_adds_epi16(in4, final_rounding); 542 in5 = _mm_adds_epi16(in5, final_rounding); 543 in6 = _mm_adds_epi16(in6, final_rounding); 544 in7 = _mm_adds_epi16(in7, final_rounding); 545 546 in0 = _mm_srai_epi16(in0, 5); 547 in1 = _mm_srai_epi16(in1, 5); 548 in2 = _mm_srai_epi16(in2, 5); 549 in3 = _mm_srai_epi16(in3, 5); 550 in4 = _mm_srai_epi16(in4, 5); 551 in5 = _mm_srai_epi16(in5, 5); 552 in6 = _mm_srai_epi16(in6, 5); 553 in7 = _mm_srai_epi16(in7, 5); 554 555 RECON_AND_STORE(dest, in0); 556 RECON_AND_STORE(dest, in1); 557 RECON_AND_STORE(dest, in2); 558 RECON_AND_STORE(dest, in3); 559 RECON_AND_STORE(dest, in4); 560 RECON_AND_STORE(dest, in5); 561 RECON_AND_STORE(dest, in6); 562 RECON_AND_STORE(dest, in7); 563} 564 565void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 566 __m128i dc_value; 567 const __m128i zero = _mm_setzero_si128(); 568 int a; 569 570 a = dct_const_round_shift(input[0] * cospi_16_64); 571 a = dct_const_round_shift(a * cospi_16_64); 572 a = ROUND_POWER_OF_TWO(a, 5); 573 574 dc_value = _mm_set1_epi16(a); 575 576 RECON_AND_STORE(dest, dc_value); 577 RECON_AND_STORE(dest, dc_value); 578 RECON_AND_STORE(dest, dc_value); 579 RECON_AND_STORE(dest, dc_value); 580 RECON_AND_STORE(dest, dc_value); 581 RECON_AND_STORE(dest, dc_value); 582 RECON_AND_STORE(dest, dc_value); 583 RECON_AND_STORE(dest, dc_value); 584} 585 586// perform 8x8 transpose 587static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { 588 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); 589 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); 590 const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); 591 const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); 592 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); 593 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); 594 const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); 595 const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); 596 597 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 598 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); 599 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 600 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); 601 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); 602 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 603 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); 604 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 605 606 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); 607 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); 608 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); 609 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); 610 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); 611 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); 612 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); 613 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); 614} 615 616static void idct8_1d_sse2(__m128i *in) { 617 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 618 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 619 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 620 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 621 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 622 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 623 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 624 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 625 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 626 627 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 628 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 629 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 630 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 631 632 in0 = in[0]; 633 in1 = in[1]; 634 in2 = in[2]; 635 in3 = in[3]; 636 in4 = in[4]; 637 in5 = in[5]; 638 in6 = in[6]; 639 in7 = in[7]; 640 641 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() 642 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 643 in4, in5, in6, in7); 644 645 // 4-stage 1D idct8x8 646 IDCT8_1D 647 in[0] = in0; 648 in[1] = in1; 649 in[2] = in2; 650 in[3] = in3; 651 in[4] = in4; 652 in[5] = in5; 653 in[6] = in6; 654 in[7] = in7; 655} 656 657static void iadst8_1d_sse2(__m128i *in) { 658 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 659 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 660 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 661 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 662 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 663 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 664 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 665 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 666 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 667 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 668 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 669 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 670 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 671 const __m128i k__const_0 = _mm_set1_epi16(0); 672 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 673 674 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 675 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 676 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 677 __m128i s0, s1, s2, s3, s4, s5, s6, s7; 678 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 679 680 // transpose 681 array_transpose_8x8(in, in); 682 683 // properly aligned for butterfly input 684 in0 = in[7]; 685 in1 = in[0]; 686 in2 = in[5]; 687 in3 = in[2]; 688 in4 = in[3]; 689 in5 = in[4]; 690 in6 = in[1]; 691 in7 = in[6]; 692 693 // column transformation 694 // stage 1 695 // interleave and multiply/add into 32-bit integer 696 s0 = _mm_unpacklo_epi16(in0, in1); 697 s1 = _mm_unpackhi_epi16(in0, in1); 698 s2 = _mm_unpacklo_epi16(in2, in3); 699 s3 = _mm_unpackhi_epi16(in2, in3); 700 s4 = _mm_unpacklo_epi16(in4, in5); 701 s5 = _mm_unpackhi_epi16(in4, in5); 702 s6 = _mm_unpacklo_epi16(in6, in7); 703 s7 = _mm_unpackhi_epi16(in6, in7); 704 705 u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); 706 u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); 707 u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); 708 u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); 709 u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); 710 u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); 711 u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); 712 u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); 713 u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); 714 u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); 715 u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); 716 u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); 717 u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); 718 u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); 719 u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); 720 u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); 721 722 // addition 723 w0 = _mm_add_epi32(u0, u8); 724 w1 = _mm_add_epi32(u1, u9); 725 w2 = _mm_add_epi32(u2, u10); 726 w3 = _mm_add_epi32(u3, u11); 727 w4 = _mm_add_epi32(u4, u12); 728 w5 = _mm_add_epi32(u5, u13); 729 w6 = _mm_add_epi32(u6, u14); 730 w7 = _mm_add_epi32(u7, u15); 731 w8 = _mm_sub_epi32(u0, u8); 732 w9 = _mm_sub_epi32(u1, u9); 733 w10 = _mm_sub_epi32(u2, u10); 734 w11 = _mm_sub_epi32(u3, u11); 735 w12 = _mm_sub_epi32(u4, u12); 736 w13 = _mm_sub_epi32(u5, u13); 737 w14 = _mm_sub_epi32(u6, u14); 738 w15 = _mm_sub_epi32(u7, u15); 739 740 // shift and rounding 741 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 742 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 743 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 744 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 745 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 746 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 747 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 748 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 749 v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); 750 v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); 751 v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); 752 v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 753 v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); 754 v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); 755 v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); 756 v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); 757 758 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 759 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 760 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 761 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 762 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 763 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 764 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 765 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 766 u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); 767 u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); 768 u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); 769 u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); 770 u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); 771 u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); 772 u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); 773 u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); 774 775 // back to 16-bit and pack 8 integers into __m128i 776 in[0] = _mm_packs_epi32(u0, u1); 777 in[1] = _mm_packs_epi32(u2, u3); 778 in[2] = _mm_packs_epi32(u4, u5); 779 in[3] = _mm_packs_epi32(u6, u7); 780 in[4] = _mm_packs_epi32(u8, u9); 781 in[5] = _mm_packs_epi32(u10, u11); 782 in[6] = _mm_packs_epi32(u12, u13); 783 in[7] = _mm_packs_epi32(u14, u15); 784 785 // stage 2 786 s0 = _mm_add_epi16(in[0], in[2]); 787 s1 = _mm_add_epi16(in[1], in[3]); 788 s2 = _mm_sub_epi16(in[0], in[2]); 789 s3 = _mm_sub_epi16(in[1], in[3]); 790 u0 = _mm_unpacklo_epi16(in[4], in[5]); 791 u1 = _mm_unpackhi_epi16(in[4], in[5]); 792 u2 = _mm_unpacklo_epi16(in[6], in[7]); 793 u3 = _mm_unpackhi_epi16(in[6], in[7]); 794 795 v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); 796 v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); 797 v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); 798 v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); 799 v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); 800 v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); 801 v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); 802 v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); 803 804 w0 = _mm_add_epi32(v0, v4); 805 w1 = _mm_add_epi32(v1, v5); 806 w2 = _mm_add_epi32(v2, v6); 807 w3 = _mm_add_epi32(v3, v7); 808 w4 = _mm_sub_epi32(v0, v4); 809 w5 = _mm_sub_epi32(v1, v5); 810 w6 = _mm_sub_epi32(v2, v6); 811 w7 = _mm_sub_epi32(v3, v7); 812 813 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 814 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 815 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 816 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 817 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 818 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 819 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 820 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 821 822 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 823 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 824 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 825 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 826 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 827 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 828 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 829 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 830 831 // back to 16-bit intergers 832 s4 = _mm_packs_epi32(u0, u1); 833 s5 = _mm_packs_epi32(u2, u3); 834 s6 = _mm_packs_epi32(u4, u5); 835 s7 = _mm_packs_epi32(u6, u7); 836 837 // stage 3 838 u0 = _mm_unpacklo_epi16(s2, s3); 839 u1 = _mm_unpackhi_epi16(s2, s3); 840 u2 = _mm_unpacklo_epi16(s6, s7); 841 u3 = _mm_unpackhi_epi16(s6, s7); 842 843 v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); 844 v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); 845 v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); 846 v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); 847 v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); 848 v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); 849 v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); 850 v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); 851 852 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 853 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 854 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 855 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 856 u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 857 u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 858 u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 859 u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 860 861 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 862 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 863 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 864 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 865 v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 866 v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 867 v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 868 v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 869 870 s2 = _mm_packs_epi32(v0, v1); 871 s3 = _mm_packs_epi32(v2, v3); 872 s6 = _mm_packs_epi32(v4, v5); 873 s7 = _mm_packs_epi32(v6, v7); 874 875 in[0] = s0; 876 in[1] = _mm_sub_epi16(k__const_0, s4); 877 in[2] = s6; 878 in[3] = _mm_sub_epi16(k__const_0, s2); 879 in[4] = s3; 880 in[5] = _mm_sub_epi16(k__const_0, s7); 881 in[6] = s5; 882 in[7] = _mm_sub_epi16(k__const_0, s1); 883} 884 885 886void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, 887 int tx_type) { 888 __m128i in[8]; 889 const __m128i zero = _mm_setzero_si128(); 890 const __m128i final_rounding = _mm_set1_epi16(1<<4); 891 892 // load input data 893 in[0] = _mm_load_si128((const __m128i *)input); 894 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); 895 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); 896 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); 897 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); 898 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); 899 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); 900 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); 901 902 switch (tx_type) { 903 case 0: // DCT_DCT 904 idct8_1d_sse2(in); 905 idct8_1d_sse2(in); 906 break; 907 case 1: // ADST_DCT 908 idct8_1d_sse2(in); 909 iadst8_1d_sse2(in); 910 break; 911 case 2: // DCT_ADST 912 iadst8_1d_sse2(in); 913 idct8_1d_sse2(in); 914 break; 915 case 3: // ADST_ADST 916 iadst8_1d_sse2(in); 917 iadst8_1d_sse2(in); 918 break; 919 default: 920 assert(0); 921 break; 922 } 923 924 // Final rounding and shift 925 in[0] = _mm_adds_epi16(in[0], final_rounding); 926 in[1] = _mm_adds_epi16(in[1], final_rounding); 927 in[2] = _mm_adds_epi16(in[2], final_rounding); 928 in[3] = _mm_adds_epi16(in[3], final_rounding); 929 in[4] = _mm_adds_epi16(in[4], final_rounding); 930 in[5] = _mm_adds_epi16(in[5], final_rounding); 931 in[6] = _mm_adds_epi16(in[6], final_rounding); 932 in[7] = _mm_adds_epi16(in[7], final_rounding); 933 934 in[0] = _mm_srai_epi16(in[0], 5); 935 in[1] = _mm_srai_epi16(in[1], 5); 936 in[2] = _mm_srai_epi16(in[2], 5); 937 in[3] = _mm_srai_epi16(in[3], 5); 938 in[4] = _mm_srai_epi16(in[4], 5); 939 in[5] = _mm_srai_epi16(in[5], 5); 940 in[6] = _mm_srai_epi16(in[6], 5); 941 in[7] = _mm_srai_epi16(in[7], 5); 942 943 RECON_AND_STORE(dest, in[0]); 944 RECON_AND_STORE(dest, in[1]); 945 RECON_AND_STORE(dest, in[2]); 946 RECON_AND_STORE(dest, in[3]); 947 RECON_AND_STORE(dest, in[4]); 948 RECON_AND_STORE(dest, in[5]); 949 RECON_AND_STORE(dest, in[6]); 950 RECON_AND_STORE(dest, in[7]); 951} 952 953void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 954 const __m128i zero = _mm_setzero_si128(); 955 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 956 const __m128i final_rounding = _mm_set1_epi16(1<<4); 957 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 958 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 959 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 960 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 961 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 962 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 963 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 964 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 965 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 966 967 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 968 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 969 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 970 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 971 972 // Rows. Load 4-row input data. 973 in0 = _mm_load_si128((const __m128i *)input); 974 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 975 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 976 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 977 978 // 8x4 Transpose 979 TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) 980 981 // Stage1 982 { //NOLINT 983 const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); 984 const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); 985 986 tmp0 = _mm_madd_epi16(lo_17, stg1_0); 987 tmp2 = _mm_madd_epi16(lo_17, stg1_1); 988 tmp4 = _mm_madd_epi16(lo_35, stg1_2); 989 tmp6 = _mm_madd_epi16(lo_35, stg1_3); 990 991 tmp0 = _mm_add_epi32(tmp0, rounding); 992 tmp2 = _mm_add_epi32(tmp2, rounding); 993 tmp4 = _mm_add_epi32(tmp4, rounding); 994 tmp6 = _mm_add_epi32(tmp6, rounding); 995 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 996 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 997 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 998 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 999 1000 stp1_4 = _mm_packs_epi32(tmp0, zero); 1001 stp1_7 = _mm_packs_epi32(tmp2, zero); 1002 stp1_5 = _mm_packs_epi32(tmp4, zero); 1003 stp1_6 = _mm_packs_epi32(tmp6, zero); 1004 } 1005 1006 // Stage2 1007 { //NOLINT 1008 const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); 1009 const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); 1010 1011 tmp0 = _mm_madd_epi16(lo_04, stg2_0); 1012 tmp2 = _mm_madd_epi16(lo_04, stg2_1); 1013 tmp4 = _mm_madd_epi16(lo_26, stg2_2); 1014 tmp6 = _mm_madd_epi16(lo_26, stg2_3); 1015 1016 tmp0 = _mm_add_epi32(tmp0, rounding); 1017 tmp2 = _mm_add_epi32(tmp2, rounding); 1018 tmp4 = _mm_add_epi32(tmp4, rounding); 1019 tmp6 = _mm_add_epi32(tmp6, rounding); 1020 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1021 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1022 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 1023 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 1024 1025 stp2_0 = _mm_packs_epi32(tmp0, zero); 1026 stp2_1 = _mm_packs_epi32(tmp2, zero); 1027 stp2_2 = _mm_packs_epi32(tmp4, zero); 1028 stp2_3 = _mm_packs_epi32(tmp6, zero); 1029 1030 stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); 1031 stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); 1032 stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); 1033 stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); 1034 } 1035 1036 // Stage3 1037 { //NOLINT 1038 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); 1039 stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); 1040 stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); 1041 stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); 1042 stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); 1043 1044 tmp0 = _mm_madd_epi16(lo_56, stg3_0); 1045 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 1046 1047 tmp0 = _mm_add_epi32(tmp0, rounding); 1048 tmp2 = _mm_add_epi32(tmp2, rounding); 1049 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1050 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1051 1052 stp1_5 = _mm_packs_epi32(tmp0, zero); 1053 stp1_6 = _mm_packs_epi32(tmp2, zero); 1054 } 1055 1056 // Stage4 1057 in0 = _mm_adds_epi16(stp1_0, stp2_7); 1058 in1 = _mm_adds_epi16(stp1_1, stp1_6); 1059 in2 = _mm_adds_epi16(stp1_2, stp1_5); 1060 in3 = _mm_adds_epi16(stp1_3, stp2_4); 1061 in4 = _mm_subs_epi16(stp1_3, stp2_4); 1062 in5 = _mm_subs_epi16(stp1_2, stp1_5); 1063 in6 = _mm_subs_epi16(stp1_1, stp1_6); 1064 in7 = _mm_subs_epi16(stp1_0, stp2_7); 1065 1066 // Columns. 4x8 Transpose 1067 TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 1068 in4, in5, in6, in7) 1069 1070 // 1D idct8x8 1071 IDCT8_1D 1072 1073 // Final rounding and shift 1074 in0 = _mm_adds_epi16(in0, final_rounding); 1075 in1 = _mm_adds_epi16(in1, final_rounding); 1076 in2 = _mm_adds_epi16(in2, final_rounding); 1077 in3 = _mm_adds_epi16(in3, final_rounding); 1078 in4 = _mm_adds_epi16(in4, final_rounding); 1079 in5 = _mm_adds_epi16(in5, final_rounding); 1080 in6 = _mm_adds_epi16(in6, final_rounding); 1081 in7 = _mm_adds_epi16(in7, final_rounding); 1082 1083 in0 = _mm_srai_epi16(in0, 5); 1084 in1 = _mm_srai_epi16(in1, 5); 1085 in2 = _mm_srai_epi16(in2, 5); 1086 in3 = _mm_srai_epi16(in3, 5); 1087 in4 = _mm_srai_epi16(in4, 5); 1088 in5 = _mm_srai_epi16(in5, 5); 1089 in6 = _mm_srai_epi16(in6, 5); 1090 in7 = _mm_srai_epi16(in7, 5); 1091 1092 RECON_AND_STORE(dest, in0); 1093 RECON_AND_STORE(dest, in1); 1094 RECON_AND_STORE(dest, in2); 1095 RECON_AND_STORE(dest, in3); 1096 RECON_AND_STORE(dest, in4); 1097 RECON_AND_STORE(dest, in5); 1098 RECON_AND_STORE(dest, in6); 1099 RECON_AND_STORE(dest, in7); 1100} 1101 1102#define IDCT16_1D \ 1103 /* Stage2 */ \ 1104 { \ 1105 const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ 1106 const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ 1107 const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ 1108 const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ 1109 const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ 1110 const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ 1111 const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ 1112 const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ 1113 \ 1114 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ 1115 stg2_0, stg2_1, stg2_2, stg2_3, \ 1116 stp2_8, stp2_15, stp2_9, stp2_14) \ 1117 \ 1118 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ 1119 stg2_4, stg2_5, stg2_6, stg2_7, \ 1120 stp2_10, stp2_13, stp2_11, stp2_12) \ 1121 } \ 1122 \ 1123 /* Stage3 */ \ 1124 { \ 1125 const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ 1126 const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ 1127 const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ 1128 const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ 1129 \ 1130 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ 1131 stg3_0, stg3_1, stg3_2, stg3_3, \ 1132 stp1_4, stp1_7, stp1_5, stp1_6) \ 1133 \ 1134 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ 1135 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 1136 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 1137 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 1138 \ 1139 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ 1140 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 1141 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 1142 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 1143 } \ 1144 \ 1145 /* Stage4 */ \ 1146 { \ 1147 const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ 1148 const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ 1149 const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ 1150 const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ 1151 \ 1152 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 1153 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 1154 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1155 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1156 \ 1157 MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ 1158 stg4_0, stg4_1, stg4_2, stg4_3, \ 1159 stp2_0, stp2_1, stp2_2, stp2_3) \ 1160 \ 1161 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ 1162 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ 1163 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ 1164 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ 1165 \ 1166 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ 1167 stg4_4, stg4_5, stg4_6, stg4_7, \ 1168 stp2_9, stp2_14, stp2_10, stp2_13) \ 1169 } \ 1170 \ 1171 /* Stage5 */ \ 1172 { \ 1173 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 1174 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 1175 \ 1176 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ 1177 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ 1178 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ 1179 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ 1180 \ 1181 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 1182 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 1183 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 1184 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 1185 \ 1186 tmp0 = _mm_add_epi32(tmp0, rounding); \ 1187 tmp1 = _mm_add_epi32(tmp1, rounding); \ 1188 tmp2 = _mm_add_epi32(tmp2, rounding); \ 1189 tmp3 = _mm_add_epi32(tmp3, rounding); \ 1190 \ 1191 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 1192 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 1193 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 1194 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 1195 \ 1196 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 1197 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 1198 \ 1199 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ 1200 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 1201 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 1202 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ 1203 \ 1204 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ 1205 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 1206 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 1207 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ 1208 } \ 1209 \ 1210 /* Stage6 */ \ 1211 { \ 1212 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1213 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1214 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 1215 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 1216 \ 1217 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ 1218 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 1219 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 1220 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ 1221 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ 1222 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 1223 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 1224 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ 1225 \ 1226 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 1227 stg6_0, stg4_0, stg6_0, stg4_0, \ 1228 stp2_10, stp2_13, stp2_11, stp2_12) \ 1229 } 1230 1231void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, 1232 int stride) { 1233 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 1234 const __m128i final_rounding = _mm_set1_epi16(1<<5); 1235 const __m128i zero = _mm_setzero_si128(); 1236 1237 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1238 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 1239 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1240 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 1241 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1242 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 1243 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1244 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 1245 1246 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1247 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 1248 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1249 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 1250 1251 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 1252 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1253 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1254 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 1255 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1256 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 1257 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1258 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1259 1260 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1261 1262 __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, 1263 in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, 1264 in10 = zero, in11 = zero, in12 = zero, in13 = zero, 1265 in14 = zero, in15 = zero; 1266 __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, 1267 l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, 1268 l12 = zero, l13 = zero, l14 = zero, l15 = zero; 1269 __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero, 1270 r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero, 1271 r12 = zero, r13 = zero, r14 = zero, r15 = zero; 1272 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 1273 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 1274 stp1_8_0, stp1_12_0; 1275 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 1276 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; 1277 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1278 int i; 1279 1280 // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct. 1281 for (i = 0; i < 4; i++) { 1282 // 1-D idct 1283 if (i < 2) { 1284 if (i == 1) input += 128; 1285 1286 // Load input data. 1287 in0 = _mm_load_si128((const __m128i *)input); 1288 in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 1289 in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 1290 in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 1291 in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); 1292 in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); 1293 in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); 1294 in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); 1295 in4 = _mm_load_si128((const __m128i *)(input + 8 * 8)); 1296 in12 = _mm_load_si128((const __m128i *)(input + 8 * 9)); 1297 in5 = _mm_load_si128((const __m128i *)(input + 8 * 10)); 1298 in13 = _mm_load_si128((const __m128i *)(input + 8 * 11)); 1299 in6 = _mm_load_si128((const __m128i *)(input + 8 * 12)); 1300 in14 = _mm_load_si128((const __m128i *)(input + 8 * 13)); 1301 in7 = _mm_load_si128((const __m128i *)(input + 8 * 14)); 1302 in15 = _mm_load_si128((const __m128i *)(input + 8 * 15)); 1303 1304 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 1305 in4, in5, in6, in7); 1306 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, 1307 in10, in11, in12, in13, in14, in15); 1308 } 1309 1310 if (i == 2) { 1311 TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, 1312 in5, in6, in7); 1313 TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, 1314 in13, in14, in15); 1315 } 1316 1317 if (i == 3) { 1318 TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, 1319 in4, in5, in6, in7); 1320 TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, 1321 in12, in13, in14, in15); 1322 } 1323 1324 IDCT16_1D 1325 1326 // Stage7 1327 if (i == 0) { 1328 // Left 8x16 1329 l0 = _mm_add_epi16(stp2_0, stp1_15); 1330 l1 = _mm_add_epi16(stp2_1, stp1_14); 1331 l2 = _mm_add_epi16(stp2_2, stp2_13); 1332 l3 = _mm_add_epi16(stp2_3, stp2_12); 1333 l4 = _mm_add_epi16(stp2_4, stp2_11); 1334 l5 = _mm_add_epi16(stp2_5, stp2_10); 1335 l6 = _mm_add_epi16(stp2_6, stp1_9); 1336 l7 = _mm_add_epi16(stp2_7, stp1_8); 1337 l8 = _mm_sub_epi16(stp2_7, stp1_8); 1338 l9 = _mm_sub_epi16(stp2_6, stp1_9); 1339 l10 = _mm_sub_epi16(stp2_5, stp2_10); 1340 l11 = _mm_sub_epi16(stp2_4, stp2_11); 1341 l12 = _mm_sub_epi16(stp2_3, stp2_12); 1342 l13 = _mm_sub_epi16(stp2_2, stp2_13); 1343 l14 = _mm_sub_epi16(stp2_1, stp1_14); 1344 l15 = _mm_sub_epi16(stp2_0, stp1_15); 1345 } else if (i == 1) { 1346 // Right 8x16 1347 r0 = _mm_add_epi16(stp2_0, stp1_15); 1348 r1 = _mm_add_epi16(stp2_1, stp1_14); 1349 r2 = _mm_add_epi16(stp2_2, stp2_13); 1350 r3 = _mm_add_epi16(stp2_3, stp2_12); 1351 r4 = _mm_add_epi16(stp2_4, stp2_11); 1352 r5 = _mm_add_epi16(stp2_5, stp2_10); 1353 r6 = _mm_add_epi16(stp2_6, stp1_9); 1354 r7 = _mm_add_epi16(stp2_7, stp1_8); 1355 r8 = _mm_sub_epi16(stp2_7, stp1_8); 1356 r9 = _mm_sub_epi16(stp2_6, stp1_9); 1357 r10 = _mm_sub_epi16(stp2_5, stp2_10); 1358 r11 = _mm_sub_epi16(stp2_4, stp2_11); 1359 r12 = _mm_sub_epi16(stp2_3, stp2_12); 1360 r13 = _mm_sub_epi16(stp2_2, stp2_13); 1361 r14 = _mm_sub_epi16(stp2_1, stp1_14); 1362 r15 = _mm_sub_epi16(stp2_0, stp1_15); 1363 } else { 1364 // 2-D 1365 in0 = _mm_add_epi16(stp2_0, stp1_15); 1366 in1 = _mm_add_epi16(stp2_1, stp1_14); 1367 in2 = _mm_add_epi16(stp2_2, stp2_13); 1368 in3 = _mm_add_epi16(stp2_3, stp2_12); 1369 in4 = _mm_add_epi16(stp2_4, stp2_11); 1370 in5 = _mm_add_epi16(stp2_5, stp2_10); 1371 in6 = _mm_add_epi16(stp2_6, stp1_9); 1372 in7 = _mm_add_epi16(stp2_7, stp1_8); 1373 in8 = _mm_sub_epi16(stp2_7, stp1_8); 1374 in9 = _mm_sub_epi16(stp2_6, stp1_9); 1375 in10 = _mm_sub_epi16(stp2_5, stp2_10); 1376 in11 = _mm_sub_epi16(stp2_4, stp2_11); 1377 in12 = _mm_sub_epi16(stp2_3, stp2_12); 1378 in13 = _mm_sub_epi16(stp2_2, stp2_13); 1379 in14 = _mm_sub_epi16(stp2_1, stp1_14); 1380 in15 = _mm_sub_epi16(stp2_0, stp1_15); 1381 1382 // Final rounding and shift 1383 in0 = _mm_adds_epi16(in0, final_rounding); 1384 in1 = _mm_adds_epi16(in1, final_rounding); 1385 in2 = _mm_adds_epi16(in2, final_rounding); 1386 in3 = _mm_adds_epi16(in3, final_rounding); 1387 in4 = _mm_adds_epi16(in4, final_rounding); 1388 in5 = _mm_adds_epi16(in5, final_rounding); 1389 in6 = _mm_adds_epi16(in6, final_rounding); 1390 in7 = _mm_adds_epi16(in7, final_rounding); 1391 in8 = _mm_adds_epi16(in8, final_rounding); 1392 in9 = _mm_adds_epi16(in9, final_rounding); 1393 in10 = _mm_adds_epi16(in10, final_rounding); 1394 in11 = _mm_adds_epi16(in11, final_rounding); 1395 in12 = _mm_adds_epi16(in12, final_rounding); 1396 in13 = _mm_adds_epi16(in13, final_rounding); 1397 in14 = _mm_adds_epi16(in14, final_rounding); 1398 in15 = _mm_adds_epi16(in15, final_rounding); 1399 1400 in0 = _mm_srai_epi16(in0, 6); 1401 in1 = _mm_srai_epi16(in1, 6); 1402 in2 = _mm_srai_epi16(in2, 6); 1403 in3 = _mm_srai_epi16(in3, 6); 1404 in4 = _mm_srai_epi16(in4, 6); 1405 in5 = _mm_srai_epi16(in5, 6); 1406 in6 = _mm_srai_epi16(in6, 6); 1407 in7 = _mm_srai_epi16(in7, 6); 1408 in8 = _mm_srai_epi16(in8, 6); 1409 in9 = _mm_srai_epi16(in9, 6); 1410 in10 = _mm_srai_epi16(in10, 6); 1411 in11 = _mm_srai_epi16(in11, 6); 1412 in12 = _mm_srai_epi16(in12, 6); 1413 in13 = _mm_srai_epi16(in13, 6); 1414 in14 = _mm_srai_epi16(in14, 6); 1415 in15 = _mm_srai_epi16(in15, 6); 1416 1417 RECON_AND_STORE(dest, in0); 1418 RECON_AND_STORE(dest, in1); 1419 RECON_AND_STORE(dest, in2); 1420 RECON_AND_STORE(dest, in3); 1421 RECON_AND_STORE(dest, in4); 1422 RECON_AND_STORE(dest, in5); 1423 RECON_AND_STORE(dest, in6); 1424 RECON_AND_STORE(dest, in7); 1425 RECON_AND_STORE(dest, in8); 1426 RECON_AND_STORE(dest, in9); 1427 RECON_AND_STORE(dest, in10); 1428 RECON_AND_STORE(dest, in11); 1429 RECON_AND_STORE(dest, in12); 1430 RECON_AND_STORE(dest, in13); 1431 RECON_AND_STORE(dest, in14); 1432 RECON_AND_STORE(dest, in15); 1433 1434 dest += 8 - (stride * 16); 1435 } 1436 } 1437} 1438 1439void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 1440 __m128i dc_value; 1441 const __m128i zero = _mm_setzero_si128(); 1442 int a, i; 1443 1444 a = dct_const_round_shift(input[0] * cospi_16_64); 1445 a = dct_const_round_shift(a * cospi_16_64); 1446 a = ROUND_POWER_OF_TWO(a, 6); 1447 1448 dc_value = _mm_set1_epi16(a); 1449 1450 for (i = 0; i < 2; ++i) { 1451 RECON_AND_STORE(dest, dc_value); 1452 RECON_AND_STORE(dest, dc_value); 1453 RECON_AND_STORE(dest, dc_value); 1454 RECON_AND_STORE(dest, dc_value); 1455 RECON_AND_STORE(dest, dc_value); 1456 RECON_AND_STORE(dest, dc_value); 1457 RECON_AND_STORE(dest, dc_value); 1458 RECON_AND_STORE(dest, dc_value); 1459 RECON_AND_STORE(dest, dc_value); 1460 RECON_AND_STORE(dest, dc_value); 1461 RECON_AND_STORE(dest, dc_value); 1462 RECON_AND_STORE(dest, dc_value); 1463 RECON_AND_STORE(dest, dc_value); 1464 RECON_AND_STORE(dest, dc_value); 1465 RECON_AND_STORE(dest, dc_value); 1466 RECON_AND_STORE(dest, dc_value); 1467 dest += 8 - (stride * 16); 1468 } 1469} 1470 1471static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { 1472 __m128i tbuf[8]; 1473 array_transpose_8x8(res0, res0); 1474 array_transpose_8x8(res1, tbuf); 1475 array_transpose_8x8(res0 + 8, res1); 1476 array_transpose_8x8(res1 + 8, res1 + 8); 1477 1478 res0[8] = tbuf[0]; 1479 res0[9] = tbuf[1]; 1480 res0[10] = tbuf[2]; 1481 res0[11] = tbuf[3]; 1482 res0[12] = tbuf[4]; 1483 res0[13] = tbuf[5]; 1484 res0[14] = tbuf[6]; 1485 res0[15] = tbuf[7]; 1486} 1487 1488static void iadst16_1d_8col(__m128i *in) { 1489 // perform 16x16 1-D ADST for 8 columns 1490 __m128i s[16], x[16], u[32], v[32]; 1491 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 1492 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 1493 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 1494 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 1495 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 1496 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 1497 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 1498 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 1499 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 1500 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 1501 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); 1502 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 1503 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 1504 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 1505 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 1506 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 1507 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 1508 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1509 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 1510 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1511 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 1512 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 1513 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 1514 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1515 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 1516 const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); 1517 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1518 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1519 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1520 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1521 const __m128i kZero = _mm_set1_epi16(0); 1522 1523 u[0] = _mm_unpacklo_epi16(in[15], in[0]); 1524 u[1] = _mm_unpackhi_epi16(in[15], in[0]); 1525 u[2] = _mm_unpacklo_epi16(in[13], in[2]); 1526 u[3] = _mm_unpackhi_epi16(in[13], in[2]); 1527 u[4] = _mm_unpacklo_epi16(in[11], in[4]); 1528 u[5] = _mm_unpackhi_epi16(in[11], in[4]); 1529 u[6] = _mm_unpacklo_epi16(in[9], in[6]); 1530 u[7] = _mm_unpackhi_epi16(in[9], in[6]); 1531 u[8] = _mm_unpacklo_epi16(in[7], in[8]); 1532 u[9] = _mm_unpackhi_epi16(in[7], in[8]); 1533 u[10] = _mm_unpacklo_epi16(in[5], in[10]); 1534 u[11] = _mm_unpackhi_epi16(in[5], in[10]); 1535 u[12] = _mm_unpacklo_epi16(in[3], in[12]); 1536 u[13] = _mm_unpackhi_epi16(in[3], in[12]); 1537 u[14] = _mm_unpacklo_epi16(in[1], in[14]); 1538 u[15] = _mm_unpackhi_epi16(in[1], in[14]); 1539 1540 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); 1541 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 1542 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 1543 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 1544 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 1545 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 1546 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); 1547 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 1548 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 1549 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 1550 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 1551 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 1552 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 1553 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 1554 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 1555 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 1556 v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); 1557 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); 1558 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 1559 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 1560 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); 1561 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 1562 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 1563 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 1564 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 1565 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 1566 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 1567 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 1568 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 1569 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 1570 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 1571 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 1572 1573 u[0] = _mm_add_epi32(v[0], v[16]); 1574 u[1] = _mm_add_epi32(v[1], v[17]); 1575 u[2] = _mm_add_epi32(v[2], v[18]); 1576 u[3] = _mm_add_epi32(v[3], v[19]); 1577 u[4] = _mm_add_epi32(v[4], v[20]); 1578 u[5] = _mm_add_epi32(v[5], v[21]); 1579 u[6] = _mm_add_epi32(v[6], v[22]); 1580 u[7] = _mm_add_epi32(v[7], v[23]); 1581 u[8] = _mm_add_epi32(v[8], v[24]); 1582 u[9] = _mm_add_epi32(v[9], v[25]); 1583 u[10] = _mm_add_epi32(v[10], v[26]); 1584 u[11] = _mm_add_epi32(v[11], v[27]); 1585 u[12] = _mm_add_epi32(v[12], v[28]); 1586 u[13] = _mm_add_epi32(v[13], v[29]); 1587 u[14] = _mm_add_epi32(v[14], v[30]); 1588 u[15] = _mm_add_epi32(v[15], v[31]); 1589 u[16] = _mm_sub_epi32(v[0], v[16]); 1590 u[17] = _mm_sub_epi32(v[1], v[17]); 1591 u[18] = _mm_sub_epi32(v[2], v[18]); 1592 u[19] = _mm_sub_epi32(v[3], v[19]); 1593 u[20] = _mm_sub_epi32(v[4], v[20]); 1594 u[21] = _mm_sub_epi32(v[5], v[21]); 1595 u[22] = _mm_sub_epi32(v[6], v[22]); 1596 u[23] = _mm_sub_epi32(v[7], v[23]); 1597 u[24] = _mm_sub_epi32(v[8], v[24]); 1598 u[25] = _mm_sub_epi32(v[9], v[25]); 1599 u[26] = _mm_sub_epi32(v[10], v[26]); 1600 u[27] = _mm_sub_epi32(v[11], v[27]); 1601 u[28] = _mm_sub_epi32(v[12], v[28]); 1602 u[29] = _mm_sub_epi32(v[13], v[29]); 1603 u[30] = _mm_sub_epi32(v[14], v[30]); 1604 u[31] = _mm_sub_epi32(v[15], v[31]); 1605 1606 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1607 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1608 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1609 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1610 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1611 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1612 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1613 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1614 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1615 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1616 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1617 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1618 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1619 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1620 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1621 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1622 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 1623 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 1624 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 1625 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 1626 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 1627 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 1628 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 1629 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 1630 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 1631 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 1632 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 1633 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 1634 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 1635 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 1636 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 1637 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 1638 1639 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1640 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1641 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1642 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1643 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1644 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1645 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1646 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1647 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 1648 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 1649 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1650 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1651 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1652 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1653 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1654 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1655 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 1656 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 1657 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 1658 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); 1659 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 1660 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 1661 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 1662 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 1663 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 1664 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 1665 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 1666 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 1667 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 1668 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 1669 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 1670 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 1671 1672 s[0] = _mm_packs_epi32(u[0], u[1]); 1673 s[1] = _mm_packs_epi32(u[2], u[3]); 1674 s[2] = _mm_packs_epi32(u[4], u[5]); 1675 s[3] = _mm_packs_epi32(u[6], u[7]); 1676 s[4] = _mm_packs_epi32(u[8], u[9]); 1677 s[5] = _mm_packs_epi32(u[10], u[11]); 1678 s[6] = _mm_packs_epi32(u[12], u[13]); 1679 s[7] = _mm_packs_epi32(u[14], u[15]); 1680 s[8] = _mm_packs_epi32(u[16], u[17]); 1681 s[9] = _mm_packs_epi32(u[18], u[19]); 1682 s[10] = _mm_packs_epi32(u[20], u[21]); 1683 s[11] = _mm_packs_epi32(u[22], u[23]); 1684 s[12] = _mm_packs_epi32(u[24], u[25]); 1685 s[13] = _mm_packs_epi32(u[26], u[27]); 1686 s[14] = _mm_packs_epi32(u[28], u[29]); 1687 s[15] = _mm_packs_epi32(u[30], u[31]); 1688 1689 // stage 2 1690 u[0] = _mm_unpacklo_epi16(s[8], s[9]); 1691 u[1] = _mm_unpackhi_epi16(s[8], s[9]); 1692 u[2] = _mm_unpacklo_epi16(s[10], s[11]); 1693 u[3] = _mm_unpackhi_epi16(s[10], s[11]); 1694 u[4] = _mm_unpacklo_epi16(s[12], s[13]); 1695 u[5] = _mm_unpackhi_epi16(s[12], s[13]); 1696 u[6] = _mm_unpacklo_epi16(s[14], s[15]); 1697 u[7] = _mm_unpackhi_epi16(s[14], s[15]); 1698 1699 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 1700 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 1701 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 1702 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 1703 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 1704 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 1705 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 1706 v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 1707 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 1708 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 1709 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 1710 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 1711 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 1712 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 1713 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 1714 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 1715 1716 u[0] = _mm_add_epi32(v[0], v[8]); 1717 u[1] = _mm_add_epi32(v[1], v[9]); 1718 u[2] = _mm_add_epi32(v[2], v[10]); 1719 u[3] = _mm_add_epi32(v[3], v[11]); 1720 u[4] = _mm_add_epi32(v[4], v[12]); 1721 u[5] = _mm_add_epi32(v[5], v[13]); 1722 u[6] = _mm_add_epi32(v[6], v[14]); 1723 u[7] = _mm_add_epi32(v[7], v[15]); 1724 u[8] = _mm_sub_epi32(v[0], v[8]); 1725 u[9] = _mm_sub_epi32(v[1], v[9]); 1726 u[10] = _mm_sub_epi32(v[2], v[10]); 1727 u[11] = _mm_sub_epi32(v[3], v[11]); 1728 u[12] = _mm_sub_epi32(v[4], v[12]); 1729 u[13] = _mm_sub_epi32(v[5], v[13]); 1730 u[14] = _mm_sub_epi32(v[6], v[14]); 1731 u[15] = _mm_sub_epi32(v[7], v[15]); 1732 1733 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1734 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1735 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1736 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1737 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1738 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1739 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1740 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1741 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1742 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1743 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1744 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1745 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1746 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1747 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1748 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1749 1750 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1751 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1752 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1753 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1754 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1755 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1756 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1757 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1758 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 1759 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 1760 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1761 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1762 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1763 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1764 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1765 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1766 1767 x[0] = _mm_add_epi16(s[0], s[4]); 1768 x[1] = _mm_add_epi16(s[1], s[5]); 1769 x[2] = _mm_add_epi16(s[2], s[6]); 1770 x[3] = _mm_add_epi16(s[3], s[7]); 1771 x[4] = _mm_sub_epi16(s[0], s[4]); 1772 x[5] = _mm_sub_epi16(s[1], s[5]); 1773 x[6] = _mm_sub_epi16(s[2], s[6]); 1774 x[7] = _mm_sub_epi16(s[3], s[7]); 1775 x[8] = _mm_packs_epi32(u[0], u[1]); 1776 x[9] = _mm_packs_epi32(u[2], u[3]); 1777 x[10] = _mm_packs_epi32(u[4], u[5]); 1778 x[11] = _mm_packs_epi32(u[6], u[7]); 1779 x[12] = _mm_packs_epi32(u[8], u[9]); 1780 x[13] = _mm_packs_epi32(u[10], u[11]); 1781 x[14] = _mm_packs_epi32(u[12], u[13]); 1782 x[15] = _mm_packs_epi32(u[14], u[15]); 1783 1784 // stage 3 1785 u[0] = _mm_unpacklo_epi16(x[4], x[5]); 1786 u[1] = _mm_unpackhi_epi16(x[4], x[5]); 1787 u[2] = _mm_unpacklo_epi16(x[6], x[7]); 1788 u[3] = _mm_unpackhi_epi16(x[6], x[7]); 1789 u[4] = _mm_unpacklo_epi16(x[12], x[13]); 1790 u[5] = _mm_unpackhi_epi16(x[12], x[13]); 1791 u[6] = _mm_unpacklo_epi16(x[14], x[15]); 1792 u[7] = _mm_unpackhi_epi16(x[14], x[15]); 1793 1794 v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 1795 v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 1796 v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 1797 v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 1798 v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 1799 v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); 1800 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 1801 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 1802 v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 1803 v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 1804 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); 1805 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 1806 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); 1807 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 1808 v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 1809 v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 1810 1811 u[0] = _mm_add_epi32(v[0], v[4]); 1812 u[1] = _mm_add_epi32(v[1], v[5]); 1813 u[2] = _mm_add_epi32(v[2], v[6]); 1814 u[3] = _mm_add_epi32(v[3], v[7]); 1815 u[4] = _mm_sub_epi32(v[0], v[4]); 1816 u[5] = _mm_sub_epi32(v[1], v[5]); 1817 u[6] = _mm_sub_epi32(v[2], v[6]); 1818 u[7] = _mm_sub_epi32(v[3], v[7]); 1819 u[8] = _mm_add_epi32(v[8], v[12]); 1820 u[9] = _mm_add_epi32(v[9], v[13]); 1821 u[10] = _mm_add_epi32(v[10], v[14]); 1822 u[11] = _mm_add_epi32(v[11], v[15]); 1823 u[12] = _mm_sub_epi32(v[8], v[12]); 1824 u[13] = _mm_sub_epi32(v[9], v[13]); 1825 u[14] = _mm_sub_epi32(v[10], v[14]); 1826 u[15] = _mm_sub_epi32(v[11], v[15]); 1827 1828 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1829 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1830 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1831 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1832 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1833 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1834 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1835 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1836 u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1837 u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1838 u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1839 u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1840 u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1841 u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1842 u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1843 u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1844 1845 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1846 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1847 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1848 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1849 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1850 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1851 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1852 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1853 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1854 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1855 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1856 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1857 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 1858 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1859 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 1860 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 1861 1862 s[0] = _mm_add_epi16(x[0], x[2]); 1863 s[1] = _mm_add_epi16(x[1], x[3]); 1864 s[2] = _mm_sub_epi16(x[0], x[2]); 1865 s[3] = _mm_sub_epi16(x[1], x[3]); 1866 s[4] = _mm_packs_epi32(v[0], v[1]); 1867 s[5] = _mm_packs_epi32(v[2], v[3]); 1868 s[6] = _mm_packs_epi32(v[4], v[5]); 1869 s[7] = _mm_packs_epi32(v[6], v[7]); 1870 s[8] = _mm_add_epi16(x[8], x[10]); 1871 s[9] = _mm_add_epi16(x[9], x[11]); 1872 s[10] = _mm_sub_epi16(x[8], x[10]); 1873 s[11] = _mm_sub_epi16(x[9], x[11]); 1874 s[12] = _mm_packs_epi32(v[8], v[9]); 1875 s[13] = _mm_packs_epi32(v[10], v[11]); 1876 s[14] = _mm_packs_epi32(v[12], v[13]); 1877 s[15] = _mm_packs_epi32(v[14], v[15]); 1878 1879 // stage 4 1880 u[0] = _mm_unpacklo_epi16(s[2], s[3]); 1881 u[1] = _mm_unpackhi_epi16(s[2], s[3]); 1882 u[2] = _mm_unpacklo_epi16(s[6], s[7]); 1883 u[3] = _mm_unpackhi_epi16(s[6], s[7]); 1884 u[4] = _mm_unpacklo_epi16(s[10], s[11]); 1885 u[5] = _mm_unpackhi_epi16(s[10], s[11]); 1886 u[6] = _mm_unpacklo_epi16(s[14], s[15]); 1887 u[7] = _mm_unpackhi_epi16(s[14], s[15]); 1888 1889 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); 1890 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); 1891 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 1892 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 1893 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 1894 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 1895 v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 1896 v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 1897 v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); 1898 v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); 1899 v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); 1900 v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); 1901 v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); 1902 v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); 1903 v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); 1904 v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); 1905 1906 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1907 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1908 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1909 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1910 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1911 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1912 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1913 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1914 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 1915 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 1916 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 1917 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 1918 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 1919 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 1920 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 1921 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 1922 1923 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1924 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1925 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1926 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1927 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1928 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1929 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1930 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1931 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1932 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1933 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1934 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1935 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 1936 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1937 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 1938 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 1939 1940 in[0] = s[0]; 1941 in[1] = _mm_sub_epi16(kZero, s[8]); 1942 in[2] = s[12]; 1943 in[3] = _mm_sub_epi16(kZero, s[4]); 1944 in[4] = _mm_packs_epi32(v[4], v[5]); 1945 in[5] = _mm_packs_epi32(v[12], v[13]); 1946 in[6] = _mm_packs_epi32(v[8], v[9]); 1947 in[7] = _mm_packs_epi32(v[0], v[1]); 1948 in[8] = _mm_packs_epi32(v[2], v[3]); 1949 in[9] = _mm_packs_epi32(v[10], v[11]); 1950 in[10] = _mm_packs_epi32(v[14], v[15]); 1951 in[11] = _mm_packs_epi32(v[6], v[7]); 1952 in[12] = s[5]; 1953 in[13] = _mm_sub_epi16(kZero, s[13]); 1954 in[14] = s[9]; 1955 in[15] = _mm_sub_epi16(kZero, s[1]); 1956} 1957 1958static void idct16_1d_8col(__m128i *in) { 1959 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1960 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 1961 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1962 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 1963 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1964 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 1965 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1966 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 1967 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1968 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 1969 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1970 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 1971 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1972 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1973 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1974 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 1975 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1976 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1977 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1978 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1979 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1980 __m128i v[16], u[16], s[16], t[16]; 1981 1982 // stage 1 1983 s[0] = in[0]; 1984 s[1] = in[8]; 1985 s[2] = in[4]; 1986 s[3] = in[12]; 1987 s[4] = in[2]; 1988 s[5] = in[10]; 1989 s[6] = in[6]; 1990 s[7] = in[14]; 1991 s[8] = in[1]; 1992 s[9] = in[9]; 1993 s[10] = in[5]; 1994 s[11] = in[13]; 1995 s[12] = in[3]; 1996 s[13] = in[11]; 1997 s[14] = in[7]; 1998 s[15] = in[15]; 1999 2000 // stage 2 2001 u[0] = _mm_unpacklo_epi16(s[8], s[15]); 2002 u[1] = _mm_unpackhi_epi16(s[8], s[15]); 2003 u[2] = _mm_unpacklo_epi16(s[9], s[14]); 2004 u[3] = _mm_unpackhi_epi16(s[9], s[14]); 2005 u[4] = _mm_unpacklo_epi16(s[10], s[13]); 2006 u[5] = _mm_unpackhi_epi16(s[10], s[13]); 2007 u[6] = _mm_unpacklo_epi16(s[11], s[12]); 2008 u[7] = _mm_unpackhi_epi16(s[11], s[12]); 2009 2010 v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); 2011 v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); 2012 v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); 2013 v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); 2014 v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); 2015 v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); 2016 v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); 2017 v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); 2018 v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); 2019 v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); 2020 v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); 2021 v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); 2022 v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); 2023 v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); 2024 v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); 2025 v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); 2026 2027 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2028 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2029 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2030 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2031 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2032 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2033 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2034 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2035 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 2036 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 2037 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 2038 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 2039 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 2040 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 2041 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 2042 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 2043 2044 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2045 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2046 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2047 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2048 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2049 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2050 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2051 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2052 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2053 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2054 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2055 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2056 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2057 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2058 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2059 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2060 2061 s[8] = _mm_packs_epi32(u[0], u[1]); 2062 s[15] = _mm_packs_epi32(u[2], u[3]); 2063 s[9] = _mm_packs_epi32(u[4], u[5]); 2064 s[14] = _mm_packs_epi32(u[6], u[7]); 2065 s[10] = _mm_packs_epi32(u[8], u[9]); 2066 s[13] = _mm_packs_epi32(u[10], u[11]); 2067 s[11] = _mm_packs_epi32(u[12], u[13]); 2068 s[12] = _mm_packs_epi32(u[14], u[15]); 2069 2070 // stage 3 2071 t[0] = s[0]; 2072 t[1] = s[1]; 2073 t[2] = s[2]; 2074 t[3] = s[3]; 2075 u[0] = _mm_unpacklo_epi16(s[4], s[7]); 2076 u[1] = _mm_unpackhi_epi16(s[4], s[7]); 2077 u[2] = _mm_unpacklo_epi16(s[5], s[6]); 2078 u[3] = _mm_unpackhi_epi16(s[5], s[6]); 2079 2080 v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 2081 v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 2082 v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 2083 v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 2084 v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 2085 v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 2086 v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 2087 v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 2088 2089 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2090 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2091 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2092 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2093 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2094 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2095 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2096 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2097 2098 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2099 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2100 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2101 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2102 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2103 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2104 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2105 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2106 2107 t[4] = _mm_packs_epi32(u[0], u[1]); 2108 t[7] = _mm_packs_epi32(u[2], u[3]); 2109 t[5] = _mm_packs_epi32(u[4], u[5]); 2110 t[6] = _mm_packs_epi32(u[6], u[7]); 2111 t[8] = _mm_add_epi16(s[8], s[9]); 2112 t[9] = _mm_sub_epi16(s[8], s[9]); 2113 t[10] = _mm_sub_epi16(s[11], s[10]); 2114 t[11] = _mm_add_epi16(s[10], s[11]); 2115 t[12] = _mm_add_epi16(s[12], s[13]); 2116 t[13] = _mm_sub_epi16(s[12], s[13]); 2117 t[14] = _mm_sub_epi16(s[15], s[14]); 2118 t[15] = _mm_add_epi16(s[14], s[15]); 2119 2120 // stage 4 2121 u[0] = _mm_unpacklo_epi16(t[0], t[1]); 2122 u[1] = _mm_unpackhi_epi16(t[0], t[1]); 2123 u[2] = _mm_unpacklo_epi16(t[2], t[3]); 2124 u[3] = _mm_unpackhi_epi16(t[2], t[3]); 2125 u[4] = _mm_unpacklo_epi16(t[9], t[14]); 2126 u[5] = _mm_unpackhi_epi16(t[9], t[14]); 2127 u[6] = _mm_unpacklo_epi16(t[10], t[13]); 2128 u[7] = _mm_unpackhi_epi16(t[10], t[13]); 2129 2130 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 2131 v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 2132 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 2133 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 2134 v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); 2135 v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); 2136 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 2137 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 2138 v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); 2139 v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); 2140 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); 2141 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); 2142 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); 2143 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); 2144 v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); 2145 v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); 2146 2147 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2148 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2149 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2150 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2151 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2152 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2153 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2154 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2155 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 2156 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 2157 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 2158 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 2159 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 2160 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 2161 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 2162 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 2163 2164 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2165 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2166 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2167 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2168 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2169 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2170 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2171 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2172 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2173 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2174 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2175 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2176 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2177 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2178 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2179 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2180 2181 s[0] = _mm_packs_epi32(u[0], u[1]); 2182 s[1] = _mm_packs_epi32(u[2], u[3]); 2183 s[2] = _mm_packs_epi32(u[4], u[5]); 2184 s[3] = _mm_packs_epi32(u[6], u[7]); 2185 s[4] = _mm_add_epi16(t[4], t[5]); 2186 s[5] = _mm_sub_epi16(t[4], t[5]); 2187 s[6] = _mm_sub_epi16(t[7], t[6]); 2188 s[7] = _mm_add_epi16(t[6], t[7]); 2189 s[8] = t[8]; 2190 s[15] = t[15]; 2191 s[9] = _mm_packs_epi32(u[8], u[9]); 2192 s[14] = _mm_packs_epi32(u[10], u[11]); 2193 s[10] = _mm_packs_epi32(u[12], u[13]); 2194 s[13] = _mm_packs_epi32(u[14], u[15]); 2195 s[11] = t[11]; 2196 s[12] = t[12]; 2197 2198 // stage 5 2199 t[0] = _mm_add_epi16(s[0], s[3]); 2200 t[1] = _mm_add_epi16(s[1], s[2]); 2201 t[2] = _mm_sub_epi16(s[1], s[2]); 2202 t[3] = _mm_sub_epi16(s[0], s[3]); 2203 t[4] = s[4]; 2204 t[7] = s[7]; 2205 2206 u[0] = _mm_unpacklo_epi16(s[5], s[6]); 2207 u[1] = _mm_unpackhi_epi16(s[5], s[6]); 2208 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 2209 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 2210 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 2211 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 2212 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2213 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2214 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2215 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2216 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2217 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2218 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2219 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2220 t[5] = _mm_packs_epi32(u[0], u[1]); 2221 t[6] = _mm_packs_epi32(u[2], u[3]); 2222 2223 t[8] = _mm_add_epi16(s[8], s[11]); 2224 t[9] = _mm_add_epi16(s[9], s[10]); 2225 t[10] = _mm_sub_epi16(s[9], s[10]); 2226 t[11] = _mm_sub_epi16(s[8], s[11]); 2227 t[12] = _mm_sub_epi16(s[15], s[12]); 2228 t[13] = _mm_sub_epi16(s[14], s[13]); 2229 t[14] = _mm_add_epi16(s[13], s[14]); 2230 t[15] = _mm_add_epi16(s[12], s[15]); 2231 2232 // stage 6 2233 s[0] = _mm_add_epi16(t[0], t[7]); 2234 s[1] = _mm_add_epi16(t[1], t[6]); 2235 s[2] = _mm_add_epi16(t[2], t[5]); 2236 s[3] = _mm_add_epi16(t[3], t[4]); 2237 s[4] = _mm_sub_epi16(t[3], t[4]); 2238 s[5] = _mm_sub_epi16(t[2], t[5]); 2239 s[6] = _mm_sub_epi16(t[1], t[6]); 2240 s[7] = _mm_sub_epi16(t[0], t[7]); 2241 s[8] = t[8]; 2242 s[9] = t[9]; 2243 2244 u[0] = _mm_unpacklo_epi16(t[10], t[13]); 2245 u[1] = _mm_unpackhi_epi16(t[10], t[13]); 2246 u[2] = _mm_unpacklo_epi16(t[11], t[12]); 2247 u[3] = _mm_unpackhi_epi16(t[11], t[12]); 2248 2249 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 2250 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 2251 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 2252 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 2253 v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 2254 v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 2255 v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 2256 v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 2257 2258 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2259 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2260 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2261 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2262 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2263 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2264 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2265 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2266 2267 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2268 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2269 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2270 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2271 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2272 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2273 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2274 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2275 2276 s[10] = _mm_packs_epi32(u[0], u[1]); 2277 s[13] = _mm_packs_epi32(u[2], u[3]); 2278 s[11] = _mm_packs_epi32(u[4], u[5]); 2279 s[12] = _mm_packs_epi32(u[6], u[7]); 2280 s[14] = t[14]; 2281 s[15] = t[15]; 2282 2283 // stage 7 2284 in[0] = _mm_add_epi16(s[0], s[15]); 2285 in[1] = _mm_add_epi16(s[1], s[14]); 2286 in[2] = _mm_add_epi16(s[2], s[13]); 2287 in[3] = _mm_add_epi16(s[3], s[12]); 2288 in[4] = _mm_add_epi16(s[4], s[11]); 2289 in[5] = _mm_add_epi16(s[5], s[10]); 2290 in[6] = _mm_add_epi16(s[6], s[9]); 2291 in[7] = _mm_add_epi16(s[7], s[8]); 2292 in[8] = _mm_sub_epi16(s[7], s[8]); 2293 in[9] = _mm_sub_epi16(s[6], s[9]); 2294 in[10] = _mm_sub_epi16(s[5], s[10]); 2295 in[11] = _mm_sub_epi16(s[4], s[11]); 2296 in[12] = _mm_sub_epi16(s[3], s[12]); 2297 in[13] = _mm_sub_epi16(s[2], s[13]); 2298 in[14] = _mm_sub_epi16(s[1], s[14]); 2299 in[15] = _mm_sub_epi16(s[0], s[15]); 2300} 2301 2302static void idct16_1d_sse2(__m128i *in0, __m128i *in1) { 2303 array_transpose_16x16(in0, in1); 2304 idct16_1d_8col(in0); 2305 idct16_1d_8col(in1); 2306} 2307 2308static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { 2309 array_transpose_16x16(in0, in1); 2310 iadst16_1d_8col(in0); 2311 iadst16_1d_8col(in1); 2312} 2313 2314static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { 2315 in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); 2316 in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); 2317 in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); 2318 in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); 2319 in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); 2320 in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); 2321 in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); 2322 in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); 2323 2324 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); 2325 in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); 2326 in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); 2327 in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); 2328 in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); 2329 in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); 2330 in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); 2331 in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); 2332} 2333 2334static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { 2335 const __m128i final_rounding = _mm_set1_epi16(1<<5); 2336 const __m128i zero = _mm_setzero_si128(); 2337 // Final rounding and shift 2338 in[0] = _mm_adds_epi16(in[0], final_rounding); 2339 in[1] = _mm_adds_epi16(in[1], final_rounding); 2340 in[2] = _mm_adds_epi16(in[2], final_rounding); 2341 in[3] = _mm_adds_epi16(in[3], final_rounding); 2342 in[4] = _mm_adds_epi16(in[4], final_rounding); 2343 in[5] = _mm_adds_epi16(in[5], final_rounding); 2344 in[6] = _mm_adds_epi16(in[6], final_rounding); 2345 in[7] = _mm_adds_epi16(in[7], final_rounding); 2346 in[8] = _mm_adds_epi16(in[8], final_rounding); 2347 in[9] = _mm_adds_epi16(in[9], final_rounding); 2348 in[10] = _mm_adds_epi16(in[10], final_rounding); 2349 in[11] = _mm_adds_epi16(in[11], final_rounding); 2350 in[12] = _mm_adds_epi16(in[12], final_rounding); 2351 in[13] = _mm_adds_epi16(in[13], final_rounding); 2352 in[14] = _mm_adds_epi16(in[14], final_rounding); 2353 in[15] = _mm_adds_epi16(in[15], final_rounding); 2354 2355 in[0] = _mm_srai_epi16(in[0], 6); 2356 in[1] = _mm_srai_epi16(in[1], 6); 2357 in[2] = _mm_srai_epi16(in[2], 6); 2358 in[3] = _mm_srai_epi16(in[3], 6); 2359 in[4] = _mm_srai_epi16(in[4], 6); 2360 in[5] = _mm_srai_epi16(in[5], 6); 2361 in[6] = _mm_srai_epi16(in[6], 6); 2362 in[7] = _mm_srai_epi16(in[7], 6); 2363 in[8] = _mm_srai_epi16(in[8], 6); 2364 in[9] = _mm_srai_epi16(in[9], 6); 2365 in[10] = _mm_srai_epi16(in[10], 6); 2366 in[11] = _mm_srai_epi16(in[11], 6); 2367 in[12] = _mm_srai_epi16(in[12], 6); 2368 in[13] = _mm_srai_epi16(in[13], 6); 2369 in[14] = _mm_srai_epi16(in[14], 6); 2370 in[15] = _mm_srai_epi16(in[15], 6); 2371 2372 RECON_AND_STORE(dest, in[0]); 2373 RECON_AND_STORE(dest, in[1]); 2374 RECON_AND_STORE(dest, in[2]); 2375 RECON_AND_STORE(dest, in[3]); 2376 RECON_AND_STORE(dest, in[4]); 2377 RECON_AND_STORE(dest, in[5]); 2378 RECON_AND_STORE(dest, in[6]); 2379 RECON_AND_STORE(dest, in[7]); 2380 RECON_AND_STORE(dest, in[8]); 2381 RECON_AND_STORE(dest, in[9]); 2382 RECON_AND_STORE(dest, in[10]); 2383 RECON_AND_STORE(dest, in[11]); 2384 RECON_AND_STORE(dest, in[12]); 2385 RECON_AND_STORE(dest, in[13]); 2386 RECON_AND_STORE(dest, in[14]); 2387 RECON_AND_STORE(dest, in[15]); 2388} 2389 2390void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, 2391 int tx_type) { 2392 __m128i in0[16], in1[16]; 2393 2394 load_buffer_8x16(input, in0); 2395 input += 8; 2396 load_buffer_8x16(input, in1); 2397 2398 switch (tx_type) { 2399 case 0: // DCT_DCT 2400 idct16_1d_sse2(in0, in1); 2401 idct16_1d_sse2(in0, in1); 2402 break; 2403 case 1: // ADST_DCT 2404 idct16_1d_sse2(in0, in1); 2405 iadst16_1d_sse2(in0, in1); 2406 break; 2407 case 2: // DCT_ADST 2408 iadst16_1d_sse2(in0, in1); 2409 idct16_1d_sse2(in0, in1); 2410 break; 2411 case 3: // ADST_ADST 2412 iadst16_1d_sse2(in0, in1); 2413 iadst16_1d_sse2(in0, in1); 2414 break; 2415 default: 2416 assert(0); 2417 break; 2418 } 2419 2420 write_buffer_8x16(dest, in0, stride); 2421 dest += 8; 2422 write_buffer_8x16(dest, in1, stride); 2423} 2424 2425void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, 2426 int stride) { 2427 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 2428 const __m128i final_rounding = _mm_set1_epi16(1<<5); 2429 const __m128i zero = _mm_setzero_si128(); 2430 2431 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 2432 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 2433 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 2434 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 2435 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 2436 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 2437 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 2438 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 2439 2440 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 2441 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 2442 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 2443 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 2444 2445 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 2446 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 2447 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 2448 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 2449 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2450 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 2451 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 2452 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2453 2454 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 2455 2456 __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, 2457 in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, 2458 in10 = zero, in11 = zero, in12 = zero, in13 = zero, 2459 in14 = zero, in15 = zero; 2460 __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, 2461 l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, 2462 l12 = zero, l13 = zero, l14 = zero, l15 = zero; 2463 2464 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 2465 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 2466 stp1_8_0, stp1_12_0; 2467 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 2468 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; 2469 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 2470 int i; 2471 // 1-D idct. Load input data. 2472 in0 = _mm_load_si128((const __m128i *)input); 2473 in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 2474 in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 2475 in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 2476 in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); 2477 in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); 2478 in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); 2479 in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); 2480 2481 TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); 2482 TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); 2483 2484 // Stage2 2485 { 2486 const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); 2487 const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); 2488 const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); 2489 const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); 2490 2491 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); 2492 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); 2493 tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); 2494 tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); 2495 tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); 2496 tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); 2497 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); 2498 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); 2499 2500 tmp0 = _mm_add_epi32(tmp0, rounding); 2501 tmp2 = _mm_add_epi32(tmp2, rounding); 2502 tmp4 = _mm_add_epi32(tmp4, rounding); 2503 tmp6 = _mm_add_epi32(tmp6, rounding); 2504 tmp1 = _mm_add_epi32(tmp1, rounding); 2505 tmp3 = _mm_add_epi32(tmp3, rounding); 2506 tmp5 = _mm_add_epi32(tmp5, rounding); 2507 tmp7 = _mm_add_epi32(tmp7, rounding); 2508 2509 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2510 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2511 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 2512 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 2513 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2514 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2515 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 2516 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 2517 2518 stp2_8 = _mm_packs_epi32(tmp0, zero); 2519 stp2_15 = _mm_packs_epi32(tmp2, zero); 2520 stp2_9 = _mm_packs_epi32(tmp4, zero); 2521 stp2_14 = _mm_packs_epi32(tmp6, zero); 2522 2523 stp2_10 = _mm_packs_epi32(tmp1, zero); 2524 stp2_13 = _mm_packs_epi32(tmp3, zero); 2525 stp2_11 = _mm_packs_epi32(tmp5, zero); 2526 stp2_12 = _mm_packs_epi32(tmp7, zero); 2527 } 2528 2529 // Stage3 2530 { 2531 const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); 2532 const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); 2533 2534 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); 2535 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); 2536 tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); 2537 tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); 2538 2539 tmp0 = _mm_add_epi32(tmp0, rounding); 2540 tmp2 = _mm_add_epi32(tmp2, rounding); 2541 tmp4 = _mm_add_epi32(tmp4, rounding); 2542 tmp6 = _mm_add_epi32(tmp6, rounding); 2543 2544 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2545 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2546 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 2547 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 2548 2549 stp1_4 = _mm_packs_epi32(tmp0, zero); 2550 stp1_7 = _mm_packs_epi32(tmp2, zero); 2551 stp1_5 = _mm_packs_epi32(tmp4, zero); 2552 stp1_6 = _mm_packs_epi32(tmp6, zero); 2553 2554 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); 2555 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); 2556 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); 2557 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); 2558 2559 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); 2560 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); 2561 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); 2562 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); 2563 } 2564 2565 // Stage4 2566 { 2567 const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); 2568 const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); 2569 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); 2570 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 2571 2572 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); 2573 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); 2574 tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); 2575 tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); 2576 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); 2577 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); 2578 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); 2579 tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); 2580 2581 tmp0 = _mm_add_epi32(tmp0, rounding); 2582 tmp2 = _mm_add_epi32(tmp2, rounding); 2583 tmp4 = _mm_add_epi32(tmp4, rounding); 2584 tmp6 = _mm_add_epi32(tmp6, rounding); 2585 tmp1 = _mm_add_epi32(tmp1, rounding); 2586 tmp3 = _mm_add_epi32(tmp3, rounding); 2587 tmp5 = _mm_add_epi32(tmp5, rounding); 2588 tmp7 = _mm_add_epi32(tmp7, rounding); 2589 2590 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2591 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2592 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 2593 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 2594 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2595 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2596 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 2597 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 2598 2599 stp2_0 = _mm_packs_epi32(tmp0, zero); 2600 stp2_1 = _mm_packs_epi32(tmp2, zero); 2601 stp2_2 = _mm_packs_epi32(tmp4, zero); 2602 stp2_3 = _mm_packs_epi32(tmp6, zero); 2603 stp2_9 = _mm_packs_epi32(tmp1, zero); 2604 stp2_14 = _mm_packs_epi32(tmp3, zero); 2605 stp2_10 = _mm_packs_epi32(tmp5, zero); 2606 stp2_13 = _mm_packs_epi32(tmp7, zero); 2607 2608 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); 2609 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); 2610 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); 2611 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); 2612 } 2613 2614 // Stage5 and Stage6 2615 { 2616 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); 2617 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); 2618 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); 2619 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); 2620 2621 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); 2622 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); 2623 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); 2624 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); 2625 2626 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); 2627 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); 2628 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); 2629 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); 2630 } 2631 2632 // Stage6 2633 { 2634 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); 2635 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 2636 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); 2637 2638 tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); 2639 tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); 2640 tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); 2641 tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); 2642 tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); 2643 tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); 2644 2645 tmp1 = _mm_add_epi32(tmp1, rounding); 2646 tmp3 = _mm_add_epi32(tmp3, rounding); 2647 tmp0 = _mm_add_epi32(tmp0, rounding); 2648 tmp2 = _mm_add_epi32(tmp2, rounding); 2649 tmp4 = _mm_add_epi32(tmp4, rounding); 2650 tmp6 = _mm_add_epi32(tmp6, rounding); 2651 2652 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 2653 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 2654 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 2655 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 2656 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 2657 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 2658 2659 stp1_5 = _mm_packs_epi32(tmp1, zero); 2660 stp1_6 = _mm_packs_epi32(tmp3, zero); 2661 stp2_10 = _mm_packs_epi32(tmp0, zero); 2662 stp2_13 = _mm_packs_epi32(tmp2, zero); 2663 stp2_11 = _mm_packs_epi32(tmp4, zero); 2664 stp2_12 = _mm_packs_epi32(tmp6, zero); 2665 2666 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); 2667 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); 2668 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); 2669 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); 2670 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); 2671 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); 2672 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); 2673 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); 2674 } 2675 2676 // Stage7. Left 8x16 only. 2677 l0 = _mm_add_epi16(stp2_0, stp1_15); 2678 l1 = _mm_add_epi16(stp2_1, stp1_14); 2679 l2 = _mm_add_epi16(stp2_2, stp2_13); 2680 l3 = _mm_add_epi16(stp2_3, stp2_12); 2681 l4 = _mm_add_epi16(stp2_4, stp2_11); 2682 l5 = _mm_add_epi16(stp2_5, stp2_10); 2683 l6 = _mm_add_epi16(stp2_6, stp1_9); 2684 l7 = _mm_add_epi16(stp2_7, stp1_8); 2685 l8 = _mm_sub_epi16(stp2_7, stp1_8); 2686 l9 = _mm_sub_epi16(stp2_6, stp1_9); 2687 l10 = _mm_sub_epi16(stp2_5, stp2_10); 2688 l11 = _mm_sub_epi16(stp2_4, stp2_11); 2689 l12 = _mm_sub_epi16(stp2_3, stp2_12); 2690 l13 = _mm_sub_epi16(stp2_2, stp2_13); 2691 l14 = _mm_sub_epi16(stp2_1, stp1_14); 2692 l15 = _mm_sub_epi16(stp2_0, stp1_15); 2693 2694 // 2-D idct. We do 2 8x16 blocks. 2695 for (i = 0; i < 2; i++) { 2696 if (i == 0) 2697 TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, 2698 in5, in6, in7); 2699 2700 if (i == 1) 2701 TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, 2702 in4, in5, in6, in7); 2703 2704 in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; 2705 2706 IDCT16_1D 2707 2708 // Stage7 2709 in0 = _mm_add_epi16(stp2_0, stp1_15); 2710 in1 = _mm_add_epi16(stp2_1, stp1_14); 2711 in2 = _mm_add_epi16(stp2_2, stp2_13); 2712 in3 = _mm_add_epi16(stp2_3, stp2_12); 2713 in4 = _mm_add_epi16(stp2_4, stp2_11); 2714 in5 = _mm_add_epi16(stp2_5, stp2_10); 2715 in6 = _mm_add_epi16(stp2_6, stp1_9); 2716 in7 = _mm_add_epi16(stp2_7, stp1_8); 2717 in8 = _mm_sub_epi16(stp2_7, stp1_8); 2718 in9 = _mm_sub_epi16(stp2_6, stp1_9); 2719 in10 = _mm_sub_epi16(stp2_5, stp2_10); 2720 in11 = _mm_sub_epi16(stp2_4, stp2_11); 2721 in12 = _mm_sub_epi16(stp2_3, stp2_12); 2722 in13 = _mm_sub_epi16(stp2_2, stp2_13); 2723 in14 = _mm_sub_epi16(stp2_1, stp1_14); 2724 in15 = _mm_sub_epi16(stp2_0, stp1_15); 2725 2726 // Final rounding and shift 2727 in0 = _mm_adds_epi16(in0, final_rounding); 2728 in1 = _mm_adds_epi16(in1, final_rounding); 2729 in2 = _mm_adds_epi16(in2, final_rounding); 2730 in3 = _mm_adds_epi16(in3, final_rounding); 2731 in4 = _mm_adds_epi16(in4, final_rounding); 2732 in5 = _mm_adds_epi16(in5, final_rounding); 2733 in6 = _mm_adds_epi16(in6, final_rounding); 2734 in7 = _mm_adds_epi16(in7, final_rounding); 2735 in8 = _mm_adds_epi16(in8, final_rounding); 2736 in9 = _mm_adds_epi16(in9, final_rounding); 2737 in10 = _mm_adds_epi16(in10, final_rounding); 2738 in11 = _mm_adds_epi16(in11, final_rounding); 2739 in12 = _mm_adds_epi16(in12, final_rounding); 2740 in13 = _mm_adds_epi16(in13, final_rounding); 2741 in14 = _mm_adds_epi16(in14, final_rounding); 2742 in15 = _mm_adds_epi16(in15, final_rounding); 2743 2744 in0 = _mm_srai_epi16(in0, 6); 2745 in1 = _mm_srai_epi16(in1, 6); 2746 in2 = _mm_srai_epi16(in2, 6); 2747 in3 = _mm_srai_epi16(in3, 6); 2748 in4 = _mm_srai_epi16(in4, 6); 2749 in5 = _mm_srai_epi16(in5, 6); 2750 in6 = _mm_srai_epi16(in6, 6); 2751 in7 = _mm_srai_epi16(in7, 6); 2752 in8 = _mm_srai_epi16(in8, 6); 2753 in9 = _mm_srai_epi16(in9, 6); 2754 in10 = _mm_srai_epi16(in10, 6); 2755 in11 = _mm_srai_epi16(in11, 6); 2756 in12 = _mm_srai_epi16(in12, 6); 2757 in13 = _mm_srai_epi16(in13, 6); 2758 in14 = _mm_srai_epi16(in14, 6); 2759 in15 = _mm_srai_epi16(in15, 6); 2760 2761 RECON_AND_STORE(dest, in0); 2762 RECON_AND_STORE(dest, in1); 2763 RECON_AND_STORE(dest, in2); 2764 RECON_AND_STORE(dest, in3); 2765 RECON_AND_STORE(dest, in4); 2766 RECON_AND_STORE(dest, in5); 2767 RECON_AND_STORE(dest, in6); 2768 RECON_AND_STORE(dest, in7); 2769 RECON_AND_STORE(dest, in8); 2770 RECON_AND_STORE(dest, in9); 2771 RECON_AND_STORE(dest, in10); 2772 RECON_AND_STORE(dest, in11); 2773 RECON_AND_STORE(dest, in12); 2774 RECON_AND_STORE(dest, in13); 2775 RECON_AND_STORE(dest, in14); 2776 RECON_AND_STORE(dest, in15); 2777 2778 dest += 8 - (stride * 16); 2779 } 2780} 2781 2782#define LOAD_DQCOEFF(reg, input) \ 2783 { \ 2784 reg = _mm_load_si128((const __m128i *) input); \ 2785 input += 8; \ 2786 } \ 2787 2788#define IDCT32_1D \ 2789/* Stage1 */ \ 2790{ \ 2791 const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ 2792 const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ 2793 const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ 2794 const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ 2795 \ 2796 const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ 2797 const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ 2798 const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \ 2799 const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ 2800 \ 2801 const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ 2802 const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ 2803 const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ 2804 const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ 2805 \ 2806 const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ 2807 const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ 2808 const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ 2809 const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ 2810 \ 2811 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ 2812 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ 2813 stp1_17, stp1_30) \ 2814 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ 2815 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ 2816 stp1_19, stp1_28) \ 2817 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ 2818 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ 2819 stp1_21, stp1_26) \ 2820 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ 2821 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ 2822 stp1_23, stp1_24) \ 2823} \ 2824\ 2825/* Stage2 */ \ 2826{ \ 2827 const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ 2828 const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ 2829 const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ 2830 const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ 2831 \ 2832 const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ 2833 const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ 2834 const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ 2835 const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ 2836 \ 2837 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ 2838 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ 2839 stp2_14) \ 2840 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ 2841 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ 2842 stp2_11, stp2_12) \ 2843 \ 2844 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ 2845 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ 2846 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ 2847 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ 2848 \ 2849 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ 2850 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ 2851 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ 2852 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ 2853 \ 2854 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ 2855 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ 2856 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ 2857 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ 2858 \ 2859 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ 2860 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ 2861 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ 2862 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ 2863} \ 2864\ 2865/* Stage3 */ \ 2866{ \ 2867 const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ 2868 const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ 2869 const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ 2870 const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ 2871 \ 2872 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ 2873 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ 2874 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 2875 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 2876 \ 2877 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2878 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2879 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 2880 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 2881 \ 2882 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ 2883 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ 2884 stp1_6) \ 2885 \ 2886 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ 2887 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 2888 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 2889 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 2890 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ 2891 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 2892 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 2893 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 2894 \ 2895 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ 2896 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ 2897 stp1_18, stp1_29) \ 2898 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ 2899 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ 2900 stp1_22, stp1_25) \ 2901 \ 2902 stp1_16 = stp2_16; \ 2903 stp1_31 = stp2_31; \ 2904 stp1_19 = stp2_19; \ 2905 stp1_20 = stp2_20; \ 2906 stp1_23 = stp2_23; \ 2907 stp1_24 = stp2_24; \ 2908 stp1_27 = stp2_27; \ 2909 stp1_28 = stp2_28; \ 2910} \ 2911\ 2912/* Stage4 */ \ 2913{ \ 2914 const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ 2915 const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ 2916 const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ 2917 const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ 2918 \ 2919 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 2920 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 2921 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 2922 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 2923 \ 2924 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ 2925 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ 2926 stp2_2, stp2_3) \ 2927 \ 2928 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ 2929 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ 2930 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ 2931 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ 2932 \ 2933 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ 2934 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ 2935 stp2_10, stp2_13) \ 2936 \ 2937 stp2_8 = stp1_8; \ 2938 stp2_15 = stp1_15; \ 2939 stp2_11 = stp1_11; \ 2940 stp2_12 = stp1_12; \ 2941 \ 2942 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ 2943 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ 2944 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ 2945 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ 2946 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ 2947 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ 2948 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ 2949 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ 2950 \ 2951 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ 2952 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ 2953 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ 2954 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ 2955 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ 2956 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ 2957 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ 2958 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ 2959} \ 2960\ 2961/* Stage5 */ \ 2962{ \ 2963 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 2964 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 2965 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 2966 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 2967 \ 2968 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ 2969 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ 2970 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 2971 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 2972 \ 2973 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2974 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2975 \ 2976 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ 2977 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ 2978 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ 2979 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ 2980 \ 2981 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 2982 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 2983 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 2984 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 2985 \ 2986 tmp0 = _mm_add_epi32(tmp0, rounding); \ 2987 tmp1 = _mm_add_epi32(tmp1, rounding); \ 2988 tmp2 = _mm_add_epi32(tmp2, rounding); \ 2989 tmp3 = _mm_add_epi32(tmp3, rounding); \ 2990 \ 2991 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 2992 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 2993 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 2994 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 2995 \ 2996 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 2997 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 2998 \ 2999 stp1_4 = stp2_4; \ 3000 stp1_7 = stp2_7; \ 3001 \ 3002 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ 3003 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 3004 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 3005 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ 3006 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ 3007 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 3008 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 3009 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ 3010 \ 3011 stp1_16 = stp2_16; \ 3012 stp1_17 = stp2_17; \ 3013 \ 3014 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ 3015 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ 3016 stp1_19, stp1_28) \ 3017 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ 3018 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ 3019 stp1_21, stp1_26) \ 3020 \ 3021 stp1_22 = stp2_22; \ 3022 stp1_23 = stp2_23; \ 3023 stp1_24 = stp2_24; \ 3024 stp1_25 = stp2_25; \ 3025 stp1_30 = stp2_30; \ 3026 stp1_31 = stp2_31; \ 3027} \ 3028\ 3029/* Stage6 */ \ 3030{ \ 3031 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 3032 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 3033 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 3034 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 3035 \ 3036 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ 3037 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 3038 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 3039 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ 3040 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ 3041 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 3042 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 3043 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ 3044 \ 3045 stp2_8 = stp1_8; \ 3046 stp2_9 = stp1_9; \ 3047 stp2_14 = stp1_14; \ 3048 stp2_15 = stp1_15; \ 3049 \ 3050 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 3051 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ 3052 stp2_13, stp2_11, stp2_12) \ 3053 \ 3054 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ 3055 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ 3056 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ 3057 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ 3058 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ 3059 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ 3060 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ 3061 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ 3062 \ 3063 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ 3064 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ 3065 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ 3066 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ 3067 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ 3068 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ 3069 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ 3070 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ 3071} \ 3072\ 3073/* Stage7 */ \ 3074{ \ 3075 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 3076 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 3077 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 3078 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 3079 \ 3080 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 3081 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 3082 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ 3083 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ 3084 \ 3085 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ 3086 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ 3087 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ 3088 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ 3089 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ 3090 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ 3091 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ 3092 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ 3093 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ 3094 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ 3095 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ 3096 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ 3097 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ 3098 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ 3099 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ 3100 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ 3101 \ 3102 stp1_16 = stp2_16; \ 3103 stp1_17 = stp2_17; \ 3104 stp1_18 = stp2_18; \ 3105 stp1_19 = stp2_19; \ 3106 \ 3107 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ 3108 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ 3109 stp1_21, stp1_26) \ 3110 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ 3111 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ 3112 stp1_23, stp1_24) \ 3113 \ 3114 stp1_28 = stp2_28; \ 3115 stp1_29 = stp2_29; \ 3116 stp1_30 = stp2_30; \ 3117 stp1_31 = stp2_31; \ 3118} 3119 3120// Only upper-left 8x8 has non-zero coeff 3121void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, 3122 int stride) { 3123 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3124 const __m128i final_rounding = _mm_set1_epi16(1<<5); 3125 3126 // idct constants for each stage 3127 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3128 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3129 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 3130 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 3131 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 3132 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 3133 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 3134 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 3135 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 3136 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 3137 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); 3138 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); 3139 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); 3140 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); 3141 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 3142 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 3143 3144 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 3145 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 3146 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 3147 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 3148 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 3149 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 3150 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 3151 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 3152 3153 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 3154 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 3155 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 3156 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 3157 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 3158 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 3159 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 3160 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 3161 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 3162 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 3163 3164 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 3165 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 3166 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 3167 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 3168 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 3169 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 3170 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 3171 3172 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 3173 3174 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, 3175 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, 3176 in24, in25, in26, in27, in28, in29, in30, in31; 3177 __m128i col[128]; 3178 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 3179 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 3180 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 3181 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 3182 stp1_30, stp1_31; 3183 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 3184 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 3185 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 3186 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 3187 stp2_30, stp2_31; 3188 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3189 int i, j, i32; 3190 3191 // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. 3192 for (i = 0; i < 8; i++) { 3193 i32 = (i << 5); 3194 if (i == 0) { 3195 // First 1-D idct: first 8 rows 3196 // Load input data. 3197 LOAD_DQCOEFF(in0, input); 3198 LOAD_DQCOEFF(in8, input); 3199 LOAD_DQCOEFF(in16, input); 3200 LOAD_DQCOEFF(in24, input); 3201 LOAD_DQCOEFF(in1, input); 3202 LOAD_DQCOEFF(in9, input); 3203 LOAD_DQCOEFF(in17, input); 3204 LOAD_DQCOEFF(in25, input); 3205 LOAD_DQCOEFF(in2, input); 3206 LOAD_DQCOEFF(in10, input); 3207 LOAD_DQCOEFF(in18, input); 3208 LOAD_DQCOEFF(in26, input); 3209 LOAD_DQCOEFF(in3, input); 3210 LOAD_DQCOEFF(in11, input); 3211 LOAD_DQCOEFF(in19, input); 3212 LOAD_DQCOEFF(in27, input); 3213 3214 LOAD_DQCOEFF(in4, input); 3215 LOAD_DQCOEFF(in12, input); 3216 LOAD_DQCOEFF(in20, input); 3217 LOAD_DQCOEFF(in28, input); 3218 LOAD_DQCOEFF(in5, input); 3219 LOAD_DQCOEFF(in13, input); 3220 LOAD_DQCOEFF(in21, input); 3221 LOAD_DQCOEFF(in29, input); 3222 LOAD_DQCOEFF(in6, input); 3223 LOAD_DQCOEFF(in14, input); 3224 LOAD_DQCOEFF(in22, input); 3225 LOAD_DQCOEFF(in30, input); 3226 LOAD_DQCOEFF(in7, input); 3227 LOAD_DQCOEFF(in15, input); 3228 LOAD_DQCOEFF(in23, input); 3229 LOAD_DQCOEFF(in31, input); 3230 3231 // Transpose 32x8 block to 8x32 block 3232 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 3233 in4, in5, in6, in7); 3234 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, 3235 in10, in11, in12, in13, in14, in15); 3236 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, 3237 in18, in19, in20, in21, in22, in23); 3238 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, 3239 in26, in27, in28, in29, in30, in31); 3240 } else if (i < 4) { 3241 // First 1-D idct: next 24 zero-coeff rows 3242 col[i32 + 0] = _mm_setzero_si128(); 3243 col[i32 + 1] = _mm_setzero_si128(); 3244 col[i32 + 2] = _mm_setzero_si128(); 3245 col[i32 + 3] = _mm_setzero_si128(); 3246 col[i32 + 4] = _mm_setzero_si128(); 3247 col[i32 + 5] = _mm_setzero_si128(); 3248 col[i32 + 6] = _mm_setzero_si128(); 3249 col[i32 + 7] = _mm_setzero_si128(); 3250 col[i32 + 8] = _mm_setzero_si128(); 3251 col[i32 + 9] = _mm_setzero_si128(); 3252 col[i32 + 10] = _mm_setzero_si128(); 3253 col[i32 + 11] = _mm_setzero_si128(); 3254 col[i32 + 12] = _mm_setzero_si128(); 3255 col[i32 + 13] = _mm_setzero_si128(); 3256 col[i32 + 14] = _mm_setzero_si128(); 3257 col[i32 + 15] = _mm_setzero_si128(); 3258 col[i32 + 16] = _mm_setzero_si128(); 3259 col[i32 + 17] = _mm_setzero_si128(); 3260 col[i32 + 18] = _mm_setzero_si128(); 3261 col[i32 + 19] = _mm_setzero_si128(); 3262 col[i32 + 20] = _mm_setzero_si128(); 3263 col[i32 + 21] = _mm_setzero_si128(); 3264 col[i32 + 22] = _mm_setzero_si128(); 3265 col[i32 + 23] = _mm_setzero_si128(); 3266 col[i32 + 24] = _mm_setzero_si128(); 3267 col[i32 + 25] = _mm_setzero_si128(); 3268 col[i32 + 26] = _mm_setzero_si128(); 3269 col[i32 + 27] = _mm_setzero_si128(); 3270 col[i32 + 28] = _mm_setzero_si128(); 3271 col[i32 + 29] = _mm_setzero_si128(); 3272 col[i32 + 30] = _mm_setzero_si128(); 3273 col[i32 + 31] = _mm_setzero_si128(); 3274 continue; 3275 } else { 3276 // Second 1-D idct 3277 j = i - 4; 3278 3279 // Transpose 32x8 block to 8x32 block 3280 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 3281 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 3282 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, 3283 in5, in6, in7); 3284 j += 4; 3285 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 3286 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 3287 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, 3288 in11, in12, in13, in14, in15); 3289 j += 4; 3290 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 3291 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 3292 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, 3293 in19, in20, in21, in22, in23); 3294 j += 4; 3295 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 3296 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 3297 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, 3298 in28, in29, in30, in31); 3299 } 3300 3301 IDCT32_1D 3302 3303 // final stage 3304 if (i < 4) { 3305 // 1_D: Store 32 intermediate results for each 8x32 block. 3306 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); 3307 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); 3308 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); 3309 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); 3310 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); 3311 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); 3312 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); 3313 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); 3314 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); 3315 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); 3316 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); 3317 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); 3318 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); 3319 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); 3320 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); 3321 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); 3322 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); 3323 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); 3324 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); 3325 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); 3326 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); 3327 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); 3328 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); 3329 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); 3330 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); 3331 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); 3332 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); 3333 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); 3334 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); 3335 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); 3336 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); 3337 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); 3338 } else { 3339 const __m128i zero = _mm_setzero_si128(); 3340 3341 // 2_D: Calculate the results and store them to destination. 3342 in0 = _mm_add_epi16(stp1_0, stp1_31); 3343 in1 = _mm_add_epi16(stp1_1, stp1_30); 3344 in2 = _mm_add_epi16(stp1_2, stp1_29); 3345 in3 = _mm_add_epi16(stp1_3, stp1_28); 3346 in4 = _mm_add_epi16(stp1_4, stp1_27); 3347 in5 = _mm_add_epi16(stp1_5, stp1_26); 3348 in6 = _mm_add_epi16(stp1_6, stp1_25); 3349 in7 = _mm_add_epi16(stp1_7, stp1_24); 3350 in8 = _mm_add_epi16(stp1_8, stp1_23); 3351 in9 = _mm_add_epi16(stp1_9, stp1_22); 3352 in10 = _mm_add_epi16(stp1_10, stp1_21); 3353 in11 = _mm_add_epi16(stp1_11, stp1_20); 3354 in12 = _mm_add_epi16(stp1_12, stp1_19); 3355 in13 = _mm_add_epi16(stp1_13, stp1_18); 3356 in14 = _mm_add_epi16(stp1_14, stp1_17); 3357 in15 = _mm_add_epi16(stp1_15, stp1_16); 3358 in16 = _mm_sub_epi16(stp1_15, stp1_16); 3359 in17 = _mm_sub_epi16(stp1_14, stp1_17); 3360 in18 = _mm_sub_epi16(stp1_13, stp1_18); 3361 in19 = _mm_sub_epi16(stp1_12, stp1_19); 3362 in20 = _mm_sub_epi16(stp1_11, stp1_20); 3363 in21 = _mm_sub_epi16(stp1_10, stp1_21); 3364 in22 = _mm_sub_epi16(stp1_9, stp1_22); 3365 in23 = _mm_sub_epi16(stp1_8, stp1_23); 3366 in24 = _mm_sub_epi16(stp1_7, stp1_24); 3367 in25 = _mm_sub_epi16(stp1_6, stp1_25); 3368 in26 = _mm_sub_epi16(stp1_5, stp1_26); 3369 in27 = _mm_sub_epi16(stp1_4, stp1_27); 3370 in28 = _mm_sub_epi16(stp1_3, stp1_28); 3371 in29 = _mm_sub_epi16(stp1_2, stp1_29); 3372 in30 = _mm_sub_epi16(stp1_1, stp1_30); 3373 in31 = _mm_sub_epi16(stp1_0, stp1_31); 3374 3375 // Final rounding and shift 3376 in0 = _mm_adds_epi16(in0, final_rounding); 3377 in1 = _mm_adds_epi16(in1, final_rounding); 3378 in2 = _mm_adds_epi16(in2, final_rounding); 3379 in3 = _mm_adds_epi16(in3, final_rounding); 3380 in4 = _mm_adds_epi16(in4, final_rounding); 3381 in5 = _mm_adds_epi16(in5, final_rounding); 3382 in6 = _mm_adds_epi16(in6, final_rounding); 3383 in7 = _mm_adds_epi16(in7, final_rounding); 3384 in8 = _mm_adds_epi16(in8, final_rounding); 3385 in9 = _mm_adds_epi16(in9, final_rounding); 3386 in10 = _mm_adds_epi16(in10, final_rounding); 3387 in11 = _mm_adds_epi16(in11, final_rounding); 3388 in12 = _mm_adds_epi16(in12, final_rounding); 3389 in13 = _mm_adds_epi16(in13, final_rounding); 3390 in14 = _mm_adds_epi16(in14, final_rounding); 3391 in15 = _mm_adds_epi16(in15, final_rounding); 3392 in16 = _mm_adds_epi16(in16, final_rounding); 3393 in17 = _mm_adds_epi16(in17, final_rounding); 3394 in18 = _mm_adds_epi16(in18, final_rounding); 3395 in19 = _mm_adds_epi16(in19, final_rounding); 3396 in20 = _mm_adds_epi16(in20, final_rounding); 3397 in21 = _mm_adds_epi16(in21, final_rounding); 3398 in22 = _mm_adds_epi16(in22, final_rounding); 3399 in23 = _mm_adds_epi16(in23, final_rounding); 3400 in24 = _mm_adds_epi16(in24, final_rounding); 3401 in25 = _mm_adds_epi16(in25, final_rounding); 3402 in26 = _mm_adds_epi16(in26, final_rounding); 3403 in27 = _mm_adds_epi16(in27, final_rounding); 3404 in28 = _mm_adds_epi16(in28, final_rounding); 3405 in29 = _mm_adds_epi16(in29, final_rounding); 3406 in30 = _mm_adds_epi16(in30, final_rounding); 3407 in31 = _mm_adds_epi16(in31, final_rounding); 3408 3409 in0 = _mm_srai_epi16(in0, 6); 3410 in1 = _mm_srai_epi16(in1, 6); 3411 in2 = _mm_srai_epi16(in2, 6); 3412 in3 = _mm_srai_epi16(in3, 6); 3413 in4 = _mm_srai_epi16(in4, 6); 3414 in5 = _mm_srai_epi16(in5, 6); 3415 in6 = _mm_srai_epi16(in6, 6); 3416 in7 = _mm_srai_epi16(in7, 6); 3417 in8 = _mm_srai_epi16(in8, 6); 3418 in9 = _mm_srai_epi16(in9, 6); 3419 in10 = _mm_srai_epi16(in10, 6); 3420 in11 = _mm_srai_epi16(in11, 6); 3421 in12 = _mm_srai_epi16(in12, 6); 3422 in13 = _mm_srai_epi16(in13, 6); 3423 in14 = _mm_srai_epi16(in14, 6); 3424 in15 = _mm_srai_epi16(in15, 6); 3425 in16 = _mm_srai_epi16(in16, 6); 3426 in17 = _mm_srai_epi16(in17, 6); 3427 in18 = _mm_srai_epi16(in18, 6); 3428 in19 = _mm_srai_epi16(in19, 6); 3429 in20 = _mm_srai_epi16(in20, 6); 3430 in21 = _mm_srai_epi16(in21, 6); 3431 in22 = _mm_srai_epi16(in22, 6); 3432 in23 = _mm_srai_epi16(in23, 6); 3433 in24 = _mm_srai_epi16(in24, 6); 3434 in25 = _mm_srai_epi16(in25, 6); 3435 in26 = _mm_srai_epi16(in26, 6); 3436 in27 = _mm_srai_epi16(in27, 6); 3437 in28 = _mm_srai_epi16(in28, 6); 3438 in29 = _mm_srai_epi16(in29, 6); 3439 in30 = _mm_srai_epi16(in30, 6); 3440 in31 = _mm_srai_epi16(in31, 6); 3441 3442 RECON_AND_STORE(dest, in0); 3443 RECON_AND_STORE(dest, in1); 3444 RECON_AND_STORE(dest, in2); 3445 RECON_AND_STORE(dest, in3); 3446 RECON_AND_STORE(dest, in4); 3447 RECON_AND_STORE(dest, in5); 3448 RECON_AND_STORE(dest, in6); 3449 RECON_AND_STORE(dest, in7); 3450 RECON_AND_STORE(dest, in8); 3451 RECON_AND_STORE(dest, in9); 3452 RECON_AND_STORE(dest, in10); 3453 RECON_AND_STORE(dest, in11); 3454 RECON_AND_STORE(dest, in12); 3455 RECON_AND_STORE(dest, in13); 3456 RECON_AND_STORE(dest, in14); 3457 RECON_AND_STORE(dest, in15); 3458 RECON_AND_STORE(dest, in16); 3459 RECON_AND_STORE(dest, in17); 3460 RECON_AND_STORE(dest, in18); 3461 RECON_AND_STORE(dest, in19); 3462 RECON_AND_STORE(dest, in20); 3463 RECON_AND_STORE(dest, in21); 3464 RECON_AND_STORE(dest, in22); 3465 RECON_AND_STORE(dest, in23); 3466 RECON_AND_STORE(dest, in24); 3467 RECON_AND_STORE(dest, in25); 3468 RECON_AND_STORE(dest, in26); 3469 RECON_AND_STORE(dest, in27); 3470 RECON_AND_STORE(dest, in28); 3471 RECON_AND_STORE(dest, in29); 3472 RECON_AND_STORE(dest, in30); 3473 RECON_AND_STORE(dest, in31); 3474 3475 dest += 8 - (stride * 32); 3476 } 3477 } 3478} 3479 3480void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, 3481 int stride) { 3482 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3483 const __m128i final_rounding = _mm_set1_epi16(1<<5); 3484 3485 // idct constants for each stage 3486 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3487 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3488 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 3489 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 3490 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 3491 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 3492 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 3493 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 3494 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 3495 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 3496 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); 3497 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); 3498 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); 3499 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); 3500 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 3501 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 3502 3503 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 3504 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 3505 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 3506 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 3507 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 3508 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 3509 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 3510 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 3511 3512 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 3513 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 3514 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 3515 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 3516 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 3517 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 3518 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 3519 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 3520 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 3521 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 3522 3523 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 3524 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 3525 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 3526 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 3527 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 3528 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 3529 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 3530 3531 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 3532 3533 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, 3534 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, 3535 in24, in25, in26, in27, in28, in29, in30, in31; 3536 __m128i col[128]; 3537 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 3538 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 3539 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 3540 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 3541 stp1_30, stp1_31; 3542 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 3543 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 3544 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 3545 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 3546 stp2_30, stp2_31; 3547 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3548 int i, j, i32; 3549 __m128i zero_idx[16]; 3550 int zero_flag[2]; 3551 3552 // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. 3553 for (i = 0; i < 8; i++) { 3554 i32 = (i << 5); 3555 if (i < 4) { 3556 // First 1-D idct 3557 // Load input data. 3558 LOAD_DQCOEFF(in0, input); 3559 LOAD_DQCOEFF(in8, input); 3560 LOAD_DQCOEFF(in16, input); 3561 LOAD_DQCOEFF(in24, input); 3562 LOAD_DQCOEFF(in1, input); 3563 LOAD_DQCOEFF(in9, input); 3564 LOAD_DQCOEFF(in17, input); 3565 LOAD_DQCOEFF(in25, input); 3566 LOAD_DQCOEFF(in2, input); 3567 LOAD_DQCOEFF(in10, input); 3568 LOAD_DQCOEFF(in18, input); 3569 LOAD_DQCOEFF(in26, input); 3570 LOAD_DQCOEFF(in3, input); 3571 LOAD_DQCOEFF(in11, input); 3572 LOAD_DQCOEFF(in19, input); 3573 LOAD_DQCOEFF(in27, input); 3574 3575 LOAD_DQCOEFF(in4, input); 3576 LOAD_DQCOEFF(in12, input); 3577 LOAD_DQCOEFF(in20, input); 3578 LOAD_DQCOEFF(in28, input); 3579 LOAD_DQCOEFF(in5, input); 3580 LOAD_DQCOEFF(in13, input); 3581 LOAD_DQCOEFF(in21, input); 3582 LOAD_DQCOEFF(in29, input); 3583 LOAD_DQCOEFF(in6, input); 3584 LOAD_DQCOEFF(in14, input); 3585 LOAD_DQCOEFF(in22, input); 3586 LOAD_DQCOEFF(in30, input); 3587 LOAD_DQCOEFF(in7, input); 3588 LOAD_DQCOEFF(in15, input); 3589 LOAD_DQCOEFF(in23, input); 3590 LOAD_DQCOEFF(in31, input); 3591 3592 // checking if all entries are zero 3593 zero_idx[0] = _mm_or_si128(in0, in1); 3594 zero_idx[1] = _mm_or_si128(in2, in3); 3595 zero_idx[2] = _mm_or_si128(in4, in5); 3596 zero_idx[3] = _mm_or_si128(in6, in7); 3597 zero_idx[4] = _mm_or_si128(in8, in9); 3598 zero_idx[5] = _mm_or_si128(in10, in11); 3599 zero_idx[6] = _mm_or_si128(in12, in13); 3600 zero_idx[7] = _mm_or_si128(in14, in15); 3601 zero_idx[8] = _mm_or_si128(in16, in17); 3602 zero_idx[9] = _mm_or_si128(in18, in19); 3603 zero_idx[10] = _mm_or_si128(in20, in21); 3604 zero_idx[11] = _mm_or_si128(in22, in23); 3605 zero_idx[12] = _mm_or_si128(in24, in25); 3606 zero_idx[13] = _mm_or_si128(in26, in27); 3607 zero_idx[14] = _mm_or_si128(in28, in29); 3608 zero_idx[15] = _mm_or_si128(in30, in31); 3609 3610 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); 3611 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); 3612 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); 3613 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); 3614 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); 3615 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); 3616 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); 3617 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); 3618 3619 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); 3620 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); 3621 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); 3622 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); 3623 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); 3624 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); 3625 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); 3626 3627 zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); 3628 zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]); 3629 zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32); 3630 zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]); 3631 zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]); 3632 3633 if (!zero_flag[0] && !zero_flag[1]) { 3634 col[i32 + 0] = _mm_setzero_si128(); 3635 col[i32 + 1] = _mm_setzero_si128(); 3636 col[i32 + 2] = _mm_setzero_si128(); 3637 col[i32 + 3] = _mm_setzero_si128(); 3638 col[i32 + 4] = _mm_setzero_si128(); 3639 col[i32 + 5] = _mm_setzero_si128(); 3640 col[i32 + 6] = _mm_setzero_si128(); 3641 col[i32 + 7] = _mm_setzero_si128(); 3642 col[i32 + 8] = _mm_setzero_si128(); 3643 col[i32 + 9] = _mm_setzero_si128(); 3644 col[i32 + 10] = _mm_setzero_si128(); 3645 col[i32 + 11] = _mm_setzero_si128(); 3646 col[i32 + 12] = _mm_setzero_si128(); 3647 col[i32 + 13] = _mm_setzero_si128(); 3648 col[i32 + 14] = _mm_setzero_si128(); 3649 col[i32 + 15] = _mm_setzero_si128(); 3650 col[i32 + 16] = _mm_setzero_si128(); 3651 col[i32 + 17] = _mm_setzero_si128(); 3652 col[i32 + 18] = _mm_setzero_si128(); 3653 col[i32 + 19] = _mm_setzero_si128(); 3654 col[i32 + 20] = _mm_setzero_si128(); 3655 col[i32 + 21] = _mm_setzero_si128(); 3656 col[i32 + 22] = _mm_setzero_si128(); 3657 col[i32 + 23] = _mm_setzero_si128(); 3658 col[i32 + 24] = _mm_setzero_si128(); 3659 col[i32 + 25] = _mm_setzero_si128(); 3660 col[i32 + 26] = _mm_setzero_si128(); 3661 col[i32 + 27] = _mm_setzero_si128(); 3662 col[i32 + 28] = _mm_setzero_si128(); 3663 col[i32 + 29] = _mm_setzero_si128(); 3664 col[i32 + 30] = _mm_setzero_si128(); 3665 col[i32 + 31] = _mm_setzero_si128(); 3666 continue; 3667 } 3668 3669 // Transpose 32x8 block to 8x32 block 3670 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 3671 in4, in5, in6, in7); 3672 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, 3673 in10, in11, in12, in13, in14, in15); 3674 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, 3675 in18, in19, in20, in21, in22, in23); 3676 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, 3677 in26, in27, in28, in29, in30, in31); 3678 } else { 3679 // Second 1-D idct 3680 j = i - 4; 3681 3682 // Transpose 32x8 block to 8x32 block 3683 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 3684 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 3685 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, 3686 in5, in6, in7); 3687 j += 4; 3688 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 3689 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 3690 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, 3691 in11, in12, in13, in14, in15); 3692 j += 4; 3693 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 3694 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 3695 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, 3696 in19, in20, in21, in22, in23); 3697 j += 4; 3698 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 3699 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 3700 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, 3701 in28, in29, in30, in31); 3702 } 3703 3704 IDCT32_1D 3705 3706 // final stage 3707 if (i < 4) { 3708 // 1_D: Store 32 intermediate results for each 8x32 block. 3709 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); 3710 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); 3711 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); 3712 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); 3713 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); 3714 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); 3715 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); 3716 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); 3717 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); 3718 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); 3719 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); 3720 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); 3721 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); 3722 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); 3723 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); 3724 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); 3725 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); 3726 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); 3727 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); 3728 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); 3729 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); 3730 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); 3731 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); 3732 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); 3733 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); 3734 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); 3735 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); 3736 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); 3737 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); 3738 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); 3739 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); 3740 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); 3741 } else { 3742 const __m128i zero = _mm_setzero_si128(); 3743 3744 // 2_D: Calculate the results and store them to destination. 3745 in0 = _mm_add_epi16(stp1_0, stp1_31); 3746 in1 = _mm_add_epi16(stp1_1, stp1_30); 3747 in2 = _mm_add_epi16(stp1_2, stp1_29); 3748 in3 = _mm_add_epi16(stp1_3, stp1_28); 3749 in4 = _mm_add_epi16(stp1_4, stp1_27); 3750 in5 = _mm_add_epi16(stp1_5, stp1_26); 3751 in6 = _mm_add_epi16(stp1_6, stp1_25); 3752 in7 = _mm_add_epi16(stp1_7, stp1_24); 3753 in8 = _mm_add_epi16(stp1_8, stp1_23); 3754 in9 = _mm_add_epi16(stp1_9, stp1_22); 3755 in10 = _mm_add_epi16(stp1_10, stp1_21); 3756 in11 = _mm_add_epi16(stp1_11, stp1_20); 3757 in12 = _mm_add_epi16(stp1_12, stp1_19); 3758 in13 = _mm_add_epi16(stp1_13, stp1_18); 3759 in14 = _mm_add_epi16(stp1_14, stp1_17); 3760 in15 = _mm_add_epi16(stp1_15, stp1_16); 3761 in16 = _mm_sub_epi16(stp1_15, stp1_16); 3762 in17 = _mm_sub_epi16(stp1_14, stp1_17); 3763 in18 = _mm_sub_epi16(stp1_13, stp1_18); 3764 in19 = _mm_sub_epi16(stp1_12, stp1_19); 3765 in20 = _mm_sub_epi16(stp1_11, stp1_20); 3766 in21 = _mm_sub_epi16(stp1_10, stp1_21); 3767 in22 = _mm_sub_epi16(stp1_9, stp1_22); 3768 in23 = _mm_sub_epi16(stp1_8, stp1_23); 3769 in24 = _mm_sub_epi16(stp1_7, stp1_24); 3770 in25 = _mm_sub_epi16(stp1_6, stp1_25); 3771 in26 = _mm_sub_epi16(stp1_5, stp1_26); 3772 in27 = _mm_sub_epi16(stp1_4, stp1_27); 3773 in28 = _mm_sub_epi16(stp1_3, stp1_28); 3774 in29 = _mm_sub_epi16(stp1_2, stp1_29); 3775 in30 = _mm_sub_epi16(stp1_1, stp1_30); 3776 in31 = _mm_sub_epi16(stp1_0, stp1_31); 3777 3778 // Final rounding and shift 3779 in0 = _mm_adds_epi16(in0, final_rounding); 3780 in1 = _mm_adds_epi16(in1, final_rounding); 3781 in2 = _mm_adds_epi16(in2, final_rounding); 3782 in3 = _mm_adds_epi16(in3, final_rounding); 3783 in4 = _mm_adds_epi16(in4, final_rounding); 3784 in5 = _mm_adds_epi16(in5, final_rounding); 3785 in6 = _mm_adds_epi16(in6, final_rounding); 3786 in7 = _mm_adds_epi16(in7, final_rounding); 3787 in8 = _mm_adds_epi16(in8, final_rounding); 3788 in9 = _mm_adds_epi16(in9, final_rounding); 3789 in10 = _mm_adds_epi16(in10, final_rounding); 3790 in11 = _mm_adds_epi16(in11, final_rounding); 3791 in12 = _mm_adds_epi16(in12, final_rounding); 3792 in13 = _mm_adds_epi16(in13, final_rounding); 3793 in14 = _mm_adds_epi16(in14, final_rounding); 3794 in15 = _mm_adds_epi16(in15, final_rounding); 3795 in16 = _mm_adds_epi16(in16, final_rounding); 3796 in17 = _mm_adds_epi16(in17, final_rounding); 3797 in18 = _mm_adds_epi16(in18, final_rounding); 3798 in19 = _mm_adds_epi16(in19, final_rounding); 3799 in20 = _mm_adds_epi16(in20, final_rounding); 3800 in21 = _mm_adds_epi16(in21, final_rounding); 3801 in22 = _mm_adds_epi16(in22, final_rounding); 3802 in23 = _mm_adds_epi16(in23, final_rounding); 3803 in24 = _mm_adds_epi16(in24, final_rounding); 3804 in25 = _mm_adds_epi16(in25, final_rounding); 3805 in26 = _mm_adds_epi16(in26, final_rounding); 3806 in27 = _mm_adds_epi16(in27, final_rounding); 3807 in28 = _mm_adds_epi16(in28, final_rounding); 3808 in29 = _mm_adds_epi16(in29, final_rounding); 3809 in30 = _mm_adds_epi16(in30, final_rounding); 3810 in31 = _mm_adds_epi16(in31, final_rounding); 3811 3812 in0 = _mm_srai_epi16(in0, 6); 3813 in1 = _mm_srai_epi16(in1, 6); 3814 in2 = _mm_srai_epi16(in2, 6); 3815 in3 = _mm_srai_epi16(in3, 6); 3816 in4 = _mm_srai_epi16(in4, 6); 3817 in5 = _mm_srai_epi16(in5, 6); 3818 in6 = _mm_srai_epi16(in6, 6); 3819 in7 = _mm_srai_epi16(in7, 6); 3820 in8 = _mm_srai_epi16(in8, 6); 3821 in9 = _mm_srai_epi16(in9, 6); 3822 in10 = _mm_srai_epi16(in10, 6); 3823 in11 = _mm_srai_epi16(in11, 6); 3824 in12 = _mm_srai_epi16(in12, 6); 3825 in13 = _mm_srai_epi16(in13, 6); 3826 in14 = _mm_srai_epi16(in14, 6); 3827 in15 = _mm_srai_epi16(in15, 6); 3828 in16 = _mm_srai_epi16(in16, 6); 3829 in17 = _mm_srai_epi16(in17, 6); 3830 in18 = _mm_srai_epi16(in18, 6); 3831 in19 = _mm_srai_epi16(in19, 6); 3832 in20 = _mm_srai_epi16(in20, 6); 3833 in21 = _mm_srai_epi16(in21, 6); 3834 in22 = _mm_srai_epi16(in22, 6); 3835 in23 = _mm_srai_epi16(in23, 6); 3836 in24 = _mm_srai_epi16(in24, 6); 3837 in25 = _mm_srai_epi16(in25, 6); 3838 in26 = _mm_srai_epi16(in26, 6); 3839 in27 = _mm_srai_epi16(in27, 6); 3840 in28 = _mm_srai_epi16(in28, 6); 3841 in29 = _mm_srai_epi16(in29, 6); 3842 in30 = _mm_srai_epi16(in30, 6); 3843 in31 = _mm_srai_epi16(in31, 6); 3844 3845 RECON_AND_STORE(dest, in0); 3846 RECON_AND_STORE(dest, in1); 3847 RECON_AND_STORE(dest, in2); 3848 RECON_AND_STORE(dest, in3); 3849 RECON_AND_STORE(dest, in4); 3850 RECON_AND_STORE(dest, in5); 3851 RECON_AND_STORE(dest, in6); 3852 RECON_AND_STORE(dest, in7); 3853 RECON_AND_STORE(dest, in8); 3854 RECON_AND_STORE(dest, in9); 3855 RECON_AND_STORE(dest, in10); 3856 RECON_AND_STORE(dest, in11); 3857 RECON_AND_STORE(dest, in12); 3858 RECON_AND_STORE(dest, in13); 3859 RECON_AND_STORE(dest, in14); 3860 RECON_AND_STORE(dest, in15); 3861 RECON_AND_STORE(dest, in16); 3862 RECON_AND_STORE(dest, in17); 3863 RECON_AND_STORE(dest, in18); 3864 RECON_AND_STORE(dest, in19); 3865 RECON_AND_STORE(dest, in20); 3866 RECON_AND_STORE(dest, in21); 3867 RECON_AND_STORE(dest, in22); 3868 RECON_AND_STORE(dest, in23); 3869 RECON_AND_STORE(dest, in24); 3870 RECON_AND_STORE(dest, in25); 3871 RECON_AND_STORE(dest, in26); 3872 RECON_AND_STORE(dest, in27); 3873 RECON_AND_STORE(dest, in28); 3874 RECON_AND_STORE(dest, in29); 3875 RECON_AND_STORE(dest, in30); 3876 RECON_AND_STORE(dest, in31); 3877 3878 dest += 8 - (stride * 32); 3879 } 3880 } 3881} //NOLINT 3882 3883void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 3884 __m128i dc_value; 3885 const __m128i zero = _mm_setzero_si128(); 3886 int a, i; 3887 3888 a = dct_const_round_shift(input[0] * cospi_16_64); 3889 a = dct_const_round_shift(a * cospi_16_64); 3890 a = ROUND_POWER_OF_TWO(a, 6); 3891 3892 dc_value = _mm_set1_epi16(a); 3893 3894 for (i = 0; i < 4; ++i) { 3895 RECON_AND_STORE(dest, dc_value); 3896 RECON_AND_STORE(dest, dc_value); 3897 RECON_AND_STORE(dest, dc_value); 3898 RECON_AND_STORE(dest, dc_value); 3899 RECON_AND_STORE(dest, dc_value); 3900 RECON_AND_STORE(dest, dc_value); 3901 RECON_AND_STORE(dest, dc_value); 3902 RECON_AND_STORE(dest, dc_value); 3903 RECON_AND_STORE(dest, dc_value); 3904 RECON_AND_STORE(dest, dc_value); 3905 RECON_AND_STORE(dest, dc_value); 3906 RECON_AND_STORE(dest, dc_value); 3907 RECON_AND_STORE(dest, dc_value); 3908 RECON_AND_STORE(dest, dc_value); 3909 RECON_AND_STORE(dest, dc_value); 3910 RECON_AND_STORE(dest, dc_value); 3911 RECON_AND_STORE(dest, dc_value); 3912 RECON_AND_STORE(dest, dc_value); 3913 RECON_AND_STORE(dest, dc_value); 3914 RECON_AND_STORE(dest, dc_value); 3915 RECON_AND_STORE(dest, dc_value); 3916 RECON_AND_STORE(dest, dc_value); 3917 RECON_AND_STORE(dest, dc_value); 3918 RECON_AND_STORE(dest, dc_value); 3919 RECON_AND_STORE(dest, dc_value); 3920 RECON_AND_STORE(dest, dc_value); 3921 RECON_AND_STORE(dest, dc_value); 3922 RECON_AND_STORE(dest, dc_value); 3923 RECON_AND_STORE(dest, dc_value); 3924 RECON_AND_STORE(dest, dc_value); 3925 RECON_AND_STORE(dest, dc_value); 3926 RECON_AND_STORE(dest, dc_value); 3927 dest += 8 - (stride * 32); 3928 } 3929} 3930