fwd_txfm_sse2.h revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_X86_FWD_TXFM_SSE2_H_
#define VPX_DSP_X86_FWD_TXFM_SSE2_H_

#ifdef __cplusplus
extern "C" {
#endif

// Builds a vector whose 32-bit lanes, from lowest to highest, are
// {a, b, a, b}.
#define pair_set_epi32(a, b) \
  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))

// Pairwise multiply-accumulate of 32-bit lanes into 64-bit lanes:
//   result.q0 = a0 * b0 + a1 * b1
//   result.q1 = a2 * b2 + a3 * b3
// The products come from _mm_mul_epu32, i.e. every 32-bit lane is read as an
// unsigned value and each product is 64 bits wide.
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
  // Even lanes (0 and 2) are multiplied in place.
  const __m128i even_products = _mm_mul_epu32(a, b);
  // Bring the odd lanes (1 and 3) down to the even positions.
  const __m128i a_odd = _mm_srli_epi64(a, 32);
  const __m128i b_odd = _mm_srli_epi64(b, 32);
  const __m128i odd_products = _mm_mul_epu32(a_odd, b_odd);
  return _mm_add_epi64(even_products, odd_products);
}

// Narrows four 64-bit lanes (two in a, two in b) to four 32-bit lanes by
// keeping the low dword of each:
//   result = { lo32(a.q0), lo32(a.q1), lo32(b.q0), lo32(b.q1) }
// No saturation is applied; the upper dwords are simply discarded.
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
  const __m128i a_low_dwords = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
  const __m128i b_low_dwords = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi64(a_low_dwords, b_low_dwords);
}

// Returns non-zero when any 16-bit lane of *preg0 or *preg1 equals 0x7fff or
// 0x8000 (the largest / smallest signed 16-bit value), zero otherwise.
// The return value is the byte mask produced by _mm_movemask_epi8 over the
// OR of the per-register comparison results.
static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
                                          const __m128i *preg1) {
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  // -0x8000 has the bit pattern 0x8000 and fits in the short parameter of
  // _mm_set1_epi16; the positive literal 0x8000 would be out of range.
  const __m128i min_overflow = _mm_set1_epi16(-0x8000);
  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
                              _mm_cmpeq_epi16(*preg0, min_overflow));
  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
                              _mm_cmpeq_epi16(*preg1, min_overflow));
  cmp0 = _mm_or_si128(cmp0, cmp1);
  return _mm_movemask_epi8(cmp0);
}

// Same test as check_epi16_overflow_x2, applied to four registers.
static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
                                          const __m128i *preg1,
                                          const __m128i *preg2,
                                          const __m128i *preg3) {
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  // See check_epi16_overflow_x2 for why this is written as -0x8000.
  const __m128i min_overflow = _mm_set1_epi16(-0x8000);
  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
                              _mm_cmpeq_epi16(*preg0, min_overflow));
  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
                              _mm_cmpeq_epi16(*preg1, min_overflow));
  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
                              _mm_cmpeq_epi16(*preg2, min_overflow));
  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
                              _mm_cmpeq_epi16(*preg3, min_overflow));
  cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
  return _mm_movemask_epi8(cmp0);
}

// Eight-register variant. Both groups of four are always examined; the
// result is the sum of the two (non-negative) group masks, so it is zero
// only when neither group reported a saturated lane.
static INLINE int check_epi16_overflow_x8(const __m128i *preg0,
                                          const __m128i *preg1,
                                          const __m128i *preg2,
                                          const __m128i *preg3,
                                          const __m128i *preg4,
                                          const __m128i *preg5,
                                          const __m128i *preg6,
                                          const __m128i *preg7) {
  const int res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  const int res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  return res0 + res1;
}

// Twelve-register variant. The third group is only examined when the first
// group was clean, since a non-zero result is already guaranteed otherwise.
// Returns zero only when all twelve registers are free of saturated lanes.
static INLINE int check_epi16_overflow_x12(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *preg4,
                                           const __m128i *preg5,
                                           const __m128i *preg6,
                                           const __m128i *preg7,
                                           const __m128i *preg8,
                                           const __m128i *preg9,
                                           const __m128i *preg10,
                                           const __m128i *preg11) {
  int res0, res1;
  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  if (!res0) {
    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  }
  return res0 + res1;
}

// Sixteen-register variant. Groups of four alternate between the two
// accumulators res0 and res1; a later group is skipped as soon as an earlier
// one has already made the result non-zero. Returns zero only when all
// sixteen registers are free of saturated lanes.
static INLINE int check_epi16_overflow_x16(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *preg4,
                                           const __m128i *preg5,
                                           const __m128i *preg6,
                                           const __m128i *preg7,
                                           const __m128i *preg8,
                                           const __m128i *preg9,
                                           const __m128i *preg10,
                                           const __m128i *preg11,
                                           const __m128i *preg12,
                                           const __m128i *preg13,
                                           const __m128i *preg14,
                                           const __m128i *preg15) {
  int res0, res1;
  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  if (!res0) {
    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
    if (!res1) {
      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
    }
  }
  return res0 + res1;
}

// Thirty-two-register variant, using the same alternating short-circuit
// scheme as check_epi16_overflow_x16. Returns zero only when all thirty-two
// registers are free of saturated lanes.
static INLINE int check_epi16_overflow_x32(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *preg4,
                                           const __m128i *preg5,
                                           const __m128i *preg6,
                                           const __m128i *preg7,
                                           const __m128i *preg8,
                                           const __m128i *preg9,
                                           const __m128i *preg10,
                                           const __m128i *preg11,
                                           const __m128i *preg12,
                                           const __m128i *preg13,
                                           const __m128i *preg14,
                                           const __m128i *preg15,
                                           const __m128i *preg16,
                                           const __m128i *preg17,
                                           const __m128i *preg18,
                                           const __m128i *preg19,
                                           const __m128i *preg20,
                                           const __m128i *preg21,
                                           const __m128i *preg22,
                                           const __m128i *preg23,
                                           const __m128i *preg24,
                                           const __m128i *preg25,
                                           const __m128i *preg26,
                                           const __m128i *preg27,
                                           const __m128i *preg28,
                                           const __m128i *preg29,
                                           const __m128i *preg30,
                                           const __m128i *preg31) {
  int res0, res1;
  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  if (!res0) {
    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
    if (!res1) {
      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
      if (!res0) {
        res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
        if (!res1) {
          res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
          if (!res0) {
            res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
            if (!res1) {
              res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
            }
          }
        }
      }
    }
  }
  return res0 + res1;
}

// Each of *preg0..*preg3 is treated as two 64-bit lanes. Returns non-zero
// when, for any of those eight lanes, bits 62..31 are neither all zero nor
// all one, and zero otherwise.
// *zero is used as the comparison value for the "all zero" case.
// NOTE(review): callers presumably pass a pointer to an all-zero vector and
// use this to detect 64-bit values that do not fit in 32 bits - confirm.
static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *zero) {
  const __m128i minus_one = _mm_set1_epi32(-1);
  // Shifting left by one moves bits 62..31 of every 64-bit lane into that
  // lane's upper dword.
  const __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
  const __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
  const __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
  const __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
  // Move both upper dwords (lanes 1 and 3) into the low 64 bits.
  const __m128i reg0_top_dwords =
      _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i reg1_top_dwords =
      _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i reg2_top_dwords =
      _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i reg3_top_dwords =
      _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  // Gather the upper dwords of two registers into one vector of four dwords.
  const __m128i top_dwords_01 =
      _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
  const __m128i top_dwords_23 =
      _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
  const __m128i valid_positive_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
  const __m128i valid_positive_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
  const __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
  const __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
  // A dword cannot equal both *zero and -1 when *zero is 0, so the two masks
  // only compare equal in a lane where both are clear, i.e. where the dword
  // matched neither value.
  const int overflow_01 = _mm_movemask_epi8(
      _mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
  const int overflow_23 = _mm_movemask_epi8(
      _mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
  return (overflow_01 + overflow_23);
}

// Eight-register variant of k_check_epi32_overflow_4. Groups of four are
// examined in order and the first non-zero group result is returned; zero is
// returned when every group is clean.
static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *preg4,
                                           const __m128i *preg5,
                                           const __m128i *preg6,
                                           const __m128i *preg7,
                                           const __m128i *zero) {
  const int overflow =
      k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (overflow) {
    return overflow;
  }
  return k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
}

// Sixteen-register variant of k_check_epi32_overflow_4. Groups of four are
// examined in order and the first non-zero group result is returned; zero is
// returned when every group is clean.
static INLINE int k_check_epi32_overflow_16(const __m128i *preg0,
                                            const __m128i *preg1,
                                            const __m128i *preg2,
                                            const __m128i *preg3,
                                            const __m128i *preg4,
                                            const __m128i *preg5,
                                            const __m128i *preg6,
                                            const __m128i *preg7,
                                            const __m128i *preg8,
                                            const __m128i *preg9,
                                            const __m128i *preg10,
                                            const __m128i *preg11,
                                            const __m128i *preg12,
                                            const __m128i *preg13,
                                            const __m128i *preg14,
                                            const __m128i *preg15,
                                            const __m128i *zero) {
  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (overflow) {
    return overflow;
  }
  overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  if (overflow) {
    return overflow;
  }
  overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
  if (overflow) {
    return overflow;
  }
  return k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
}

// Thirty-two-register variant of k_check_epi32_overflow_4. Groups of four are
// examined in order and the first non-zero group result is returned; zero is
// returned when every group is clean.
static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
                                            const __m128i *preg1,
                                            const __m128i *preg2,
                                            const __m128i *preg3,
                                            const __m128i *preg4,
                                            const __m128i *preg5,
                                            const __m128i *preg6,
                                            const __m128i *preg7,
                                            const __m128i *preg8,
                                            const __m128i *preg9,
                                            const __m128i *preg10,
                                            const __m128i *preg11,
                                            const __m128i *preg12,
                                            const __m128i *preg13,
                                            const __m128i *preg14,
                                            const __m128i *preg15,
                                            const __m128i *preg16,
                                            const __m128i *preg17,
                                            const __m128i *preg18,
                                            const __m128i *preg19,
                                            const __m128i *preg20,
                                            const __m128i *preg21,
                                            const __m128i *preg22,
                                            const __m128i *preg23,
                                            const __m128i *preg24,
                                            const __m128i *preg25,
                                            const __m128i *preg26,
                                            const __m128i *preg27,
                                            const __m128i *preg28,
                                            const __m128i *preg29,
                                            const __m128i *preg30,
                                            const __m128i *preg31,
                                            const __m128i *zero) {
  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (overflow) {
    return overflow;
  }
  overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  if (overflow) {
    return overflow;
  }
  overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
  if (overflow) {
    return overflow;
  }
  overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
  if (overflow) {
    return overflow;
  }
  overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
  if (overflow) {
    return overflow;
  }
  overflow = k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
  if (overflow) {
    return overflow;
  }
  overflow = k_check_epi32_overflow_4(preg24, preg25, preg26, preg27, zero);
  if (overflow) {
    return overflow;
  }
  return k_check_epi32_overflow_4(preg28, preg29, preg30, preg31, zero);
}

// Writes the eight 16-bit lanes of *poutput to dst_ptr using aligned stores
// (_mm_store_si128 requires a 16-byte aligned address).
// With CONFIG_VP9_HIGHBITDEPTH the lanes are first sign-extended to 32 bits
// and written as two vectors, at dst_ptr and dst_ptr + 4; this assumes
// tran_low_t is 32 bits wide in that configuration.
// Otherwise the vector is stored unchanged.
static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  const __m128i zero = _mm_setzero_si128();
  // All-ones in every lane that holds a negative value: interleaving with
  // this mask sign-extends each 16-bit lane to 32 bits.
  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
  _mm_store_si128((__m128i *)(dst_ptr), out0);
  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
#else
  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}

// Same as store_output, but uses unaligned stores, so dst_ptr does not have
// to be 16-byte aligned.
static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  const __m128i zero = _mm_setzero_si128();
  // All-ones in every lane that holds a negative value: interleaving with
  // this mask sign-extends each 16-bit lane to 32 bits.
  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
#else
  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}

// For each of *pin0 and *pin1: multiplies adjacent 16-bit pairs with
// *pmultiplier and sums each pair into a 32-bit lane (_mm_madd_epi16), adds
// *prounding, then shifts right arithmetically by `shift`.
// The two 32-bit results are packed, with signed saturation, into one vector
// of eight 16-bit lanes (*pin0 results in the low half).
static INLINE __m128i mult_round_shift(const __m128i *pin0,
                                       const __m128i *pin1,
                                       const __m128i *pmultiplier,
                                       const __m128i *prounding,
                                       const int shift) {
  const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
  const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
  const __m128i v0 = _mm_add_epi32(u0, *prounding);
  const __m128i v1 = _mm_add_epi32(u1, *prounding);
  const __m128i w0 = _mm_srai_epi32(v0, shift);
  const __m128i w1 = _mm_srai_epi32(v1, shift);
  return _mm_packs_epi32(w0, w1);
}

// Transposes the 8x8 block of 16-bit values held in *pin00..*pin07 (one row
// per register) and writes the eight transposed rows with a stride of 16
// elements between rows:
//   pass == 0: to out0_ptr, as 16-bit values, with unaligned stores;
//   otherwise: to out1_ptr, through storeu_output().
// Only the pointer selected by `pass` is accessed.
static INLINE void transpose_and_output8x8(
    const __m128i *pin00, const __m128i *pin01,
    const __m128i *pin02, const __m128i *pin03,
    const __m128i *pin04, const __m128i *pin05,
    const __m128i *pin06, const __m128i *pin07,
    const int pass, int16_t *out0_ptr,
    tran_low_t *out1_ptr) {
  // Input, "rc" = row r, column c:
  // pin00: 00 01 02 03 04 05 06 07
  // pin01: 10 11 12 13 14 15 16 17
  // pin02: 20 21 22 23 24 25 26 27
  // pin03: 30 31 32 33 34 35 36 37
  // pin04: 40 41 42 43 44 45 46 47
  // pin05: 50 51 52 53 54 55 56 57
  // pin06: 60 61 62 63 64 65 66 67
  // pin07: 70 71 72 73 74 75 76 77
  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
  // tr0_0: 00 10 01 11 02 12 03 13
  // tr0_1: 20 30 21 31 22 32 23 33
  // tr0_2: 04 14 05 15 06 16 07 17
  // tr0_3: 24 34 25 35 26 36 27 37
  // tr0_4: 40 50 41 51 42 52 43 53
  // tr0_5: 60 70 61 71 62 72 63 73
  // tr0_6: 44 54 45 55 46 56 47 57
  // tr0_7: 64 74 65 75 66 76 67 77
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  // tr1_0: 00 10 20 30 01 11 21 31
  // tr1_1: 04 14 24 34 05 15 25 35
  // tr1_2: 02 12 22 32 03 13 23 33
  // tr1_3: 06 16 26 36 07 17 27 37
  // tr1_4: 40 50 60 70 41 51 61 71
  // tr1_5: 44 54 64 74 45 55 65 75
  // tr1_6: 42 52 62 72 43 53 63 73
  // tr1_7: 46 56 66 76 47 57 67 77
  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
  // tr2_0: 00 10 20 30 40 50 60 70
  // tr2_1: 01 11 21 31 41 51 61 71
  // tr2_2: 02 12 22 32 42 52 62 72
  // tr2_3: 03 13 23 33 43 53 63 73
  // tr2_4: 04 14 24 34 44 54 64 74
  // tr2_5: 05 15 25 35 45 55 65 75
  // tr2_6: 06 16 26 36 46 56 66 76
  // tr2_7: 07 17 27 37 47 57 67 77
  if (pass == 0) {
    _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
    _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
    _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
    _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
    _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
    _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
    _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
    _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
  } else {
    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
  }
}

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VPX_DSP_X86_FWD_TXFM_SSE2_H_