1/* Copyright (C) 2008-2013 Free Software Foundation, Inc. 2 3 This file is part of GCC. 4 5 GCC is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 GCC is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 Under Section 7 of GPL version 3, you are granted additional 16 permissions described in the GCC Runtime Library Exception, version 17 3.1, as published by the Free Software Foundation. 18 19 You should have received a copy of the GNU General Public License and 20 a copy of the GCC Runtime Library Exception along with this program; 21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 22 <http://www.gnu.org/licenses/>. */ 23 24/* Implemented from the specification included in the Intel C++ Compiler 25 User Guide and Reference, version 11.0. */ 26 27#ifndef _IMMINTRIN_H_INCLUDED 28# error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 29#endif 30 31/* Internal data types for implementing the intrinsics. */ 32typedef double __v4df __attribute__ ((__vector_size__ (32))); 33typedef float __v8sf __attribute__ ((__vector_size__ (32))); 34typedef long long __v4di __attribute__ ((__vector_size__ (32))); 35typedef int __v8si __attribute__ ((__vector_size__ (32))); 36typedef short __v16hi __attribute__ ((__vector_size__ (32))); 37typedef char __v32qi __attribute__ ((__vector_size__ (32))); 38 39/* The Intel API is flexible enough that we must allow aliasing with other 40 vector types, and their scalar components. 
*/ 41typedef float __m256 __attribute__ ((__vector_size__ (32), 42 __may_alias__)); 43typedef long long __m256i __attribute__ ((__vector_size__ (32), 44 __may_alias__)); 45typedef double __m256d __attribute__ ((__vector_size__ (32), 46 __may_alias__)); 47 48/* Compare predicates for scalar and packed compare intrinsics. */ 49 50/* Equal (ordered, non-signaling) */ 51#define _CMP_EQ_OQ 0x00 52/* Less-than (ordered, signaling) */ 53#define _CMP_LT_OS 0x01 54/* Less-than-or-equal (ordered, signaling) */ 55#define _CMP_LE_OS 0x02 56/* Unordered (non-signaling) */ 57#define _CMP_UNORD_Q 0x03 58/* Not-equal (unordered, non-signaling) */ 59#define _CMP_NEQ_UQ 0x04 60/* Not-less-than (unordered, signaling) */ 61#define _CMP_NLT_US 0x05 62/* Not-less-than-or-equal (unordered, signaling) */ 63#define _CMP_NLE_US 0x06 64/* Ordered (nonsignaling) */ 65#define _CMP_ORD_Q 0x07 66/* Equal (unordered, non-signaling) */ 67#define _CMP_EQ_UQ 0x08 68/* Not-greater-than-or-equal (unordered, signaling) */ 69#define _CMP_NGE_US 0x09 70/* Not-greater-than (unordered, signaling) */ 71#define _CMP_NGT_US 0x0a 72/* False (ordered, non-signaling) */ 73#define _CMP_FALSE_OQ 0x0b 74/* Not-equal (ordered, non-signaling) */ 75#define _CMP_NEQ_OQ 0x0c 76/* Greater-than-or-equal (ordered, signaling) */ 77#define _CMP_GE_OS 0x0d 78/* Greater-than (ordered, signaling) */ 79#define _CMP_GT_OS 0x0e 80/* True (unordered, non-signaling) */ 81#define _CMP_TRUE_UQ 0x0f 82/* Equal (ordered, signaling) */ 83#define _CMP_EQ_OS 0x10 84/* Less-than (ordered, non-signaling) */ 85#define _CMP_LT_OQ 0x11 86/* Less-than-or-equal (ordered, non-signaling) */ 87#define _CMP_LE_OQ 0x12 88/* Unordered (signaling) */ 89#define _CMP_UNORD_S 0x13 90/* Not-equal (unordered, signaling) */ 91#define _CMP_NEQ_US 0x14 92/* Not-less-than (unordered, non-signaling) */ 93#define _CMP_NLT_UQ 0x15 94/* Not-less-than-or-equal (unordered, non-signaling) */ 95#define _CMP_NLE_UQ 0x16 96/* Ordered (signaling) */ 97#define _CMP_ORD_S 
0x17 98/* Equal (unordered, signaling) */ 99#define _CMP_EQ_US 0x18 100/* Not-greater-than-or-equal (unordered, non-signaling) */ 101#define _CMP_NGE_UQ 0x19 102/* Not-greater-than (unordered, non-signaling) */ 103#define _CMP_NGT_UQ 0x1a 104/* False (ordered, signaling) */ 105#define _CMP_FALSE_OS 0x1b 106/* Not-equal (ordered, signaling) */ 107#define _CMP_NEQ_OS 0x1c 108/* Greater-than-or-equal (ordered, non-signaling) */ 109#define _CMP_GE_OQ 0x1d 110/* Greater-than (ordered, non-signaling) */ 111#define _CMP_GT_OQ 0x1e 112/* True (unordered, signaling) */ 113#define _CMP_TRUE_US 0x1f 114 115extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 116_mm256_add_pd (__m256d __A, __m256d __B) 117{ 118 return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B); 119} 120 121extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 122_mm256_add_ps (__m256 __A, __m256 __B) 123{ 124 return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B); 125} 126 127extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 128_mm256_addsub_pd (__m256d __A, __m256d __B) 129{ 130 return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B); 131} 132 133extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 134_mm256_addsub_ps (__m256 __A, __m256 __B) 135{ 136 return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B); 137} 138 139 140extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 141_mm256_and_pd (__m256d __A, __m256d __B) 142{ 143 return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B); 144} 145 146extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 147_mm256_and_ps (__m256 __A, __m256 __B) 148{ 149 return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B); 150} 151 152extern __inline __m256d 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) 153_mm256_andnot_pd (__m256d __A, __m256d __B) 154{ 155 return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B); 156} 157 158extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 159_mm256_andnot_ps (__m256 __A, __m256 __B) 160{ 161 return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B); 162} 163 164/* Double/single precision floating point blend instructions - select 165 data from 2 sources using constant/variable mask. */ 166 167#ifdef __OPTIMIZE__ 168extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 169_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M) 170{ 171 return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X, 172 (__v4df)__Y, 173 __M); 174} 175 176extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 177_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M) 178{ 179 return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X, 180 (__v8sf)__Y, 181 __M); 182} 183#else 184#define _mm256_blend_pd(X, Y, M) \ 185 ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \ 186 (__v4df)(__m256d)(Y), (int)(M))) 187 188#define _mm256_blend_ps(X, Y, M) \ 189 ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \ 190 (__v8sf)(__m256)(Y), (int)(M))) 191#endif 192 193extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 194_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M) 195{ 196 return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X, 197 (__v4df)__Y, 198 (__v4df)__M); 199} 200 201extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 202_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M) 203{ 204 return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X, 205 (__v8sf)__Y, 206 (__v8sf)__M); 207} 208 209extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) 210_mm256_div_pd (__m256d __A, __m256d __B) 211{ 212 return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B); 213} 214 215extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 216_mm256_div_ps (__m256 __A, __m256 __B) 217{ 218 return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B); 219} 220 221/* Dot product instructions with mask-defined summing and zeroing parts 222 of result. */ 223 224#ifdef __OPTIMIZE__ 225extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 226_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M) 227{ 228 return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X, 229 (__v8sf)__Y, 230 __M); 231} 232#else 233#define _mm256_dp_ps(X, Y, M) \ 234 ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \ 235 (__v8sf)(__m256)(Y), (int)(M))) 236#endif 237 238extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 239_mm256_hadd_pd (__m256d __X, __m256d __Y) 240{ 241 return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y); 242} 243 244extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 245_mm256_hadd_ps (__m256 __X, __m256 __Y) 246{ 247 return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y); 248} 249 250extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 251_mm256_hsub_pd (__m256d __X, __m256d __Y) 252{ 253 return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y); 254} 255 256extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 257_mm256_hsub_ps (__m256 __X, __m256 __Y) 258{ 259 return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y); 260} 261 262extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 263_mm256_max_pd (__m256d __A, __m256d __B) 264{ 265 return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, 
(__v4df)__B); 266} 267 268extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 269_mm256_max_ps (__m256 __A, __m256 __B) 270{ 271 return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B); 272} 273 274extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 275_mm256_min_pd (__m256d __A, __m256d __B) 276{ 277 return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B); 278} 279 280extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 281_mm256_min_ps (__m256 __A, __m256 __B) 282{ 283 return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B); 284} 285 286extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 287_mm256_mul_pd (__m256d __A, __m256d __B) 288{ 289 return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B); 290} 291 292extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 293_mm256_mul_ps (__m256 __A, __m256 __B) 294{ 295 return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B); 296} 297 298extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 299_mm256_or_pd (__m256d __A, __m256d __B) 300{ 301 return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B); 302} 303 304extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 305_mm256_or_ps (__m256 __A, __m256 __B) 306{ 307 return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B); 308} 309 310#ifdef __OPTIMIZE__ 311extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 312_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask) 313{ 314 return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B, 315 __mask); 316} 317 318extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 319_mm256_shuffle_ps (__m256 __A, __m256 __B, 
const int __mask) 320{ 321 return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B, 322 __mask); 323} 324#else 325#define _mm256_shuffle_pd(A, B, N) \ 326 ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \ 327 (__v4df)(__m256d)(B), (int)(N))) 328 329#define _mm256_shuffle_ps(A, B, N) \ 330 ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \ 331 (__v8sf)(__m256)(B), (int)(N))) 332#endif 333 334extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 335_mm256_sub_pd (__m256d __A, __m256d __B) 336{ 337 return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B); 338} 339 340extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 341_mm256_sub_ps (__m256 __A, __m256 __B) 342{ 343 return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B); 344} 345 346extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 347_mm256_xor_pd (__m256d __A, __m256d __B) 348{ 349 return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B); 350} 351 352extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 353_mm256_xor_ps (__m256 __A, __m256 __B) 354{ 355 return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B); 356} 357 358#ifdef __OPTIMIZE__ 359extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 360_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P) 361{ 362 return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P); 363} 364 365extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 366_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P) 367{ 368 return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P); 369} 370 371extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 372_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P) 373{ 374 return (__m256d) 
__builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y, 375 __P); 376} 377 378extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 379_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P) 380{ 381 return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y, 382 __P); 383} 384 385extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 386_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P) 387{ 388 return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P); 389} 390 391extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 392_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P) 393{ 394 return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P); 395} 396#else 397#define _mm_cmp_pd(X, Y, P) \ 398 ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \ 399 (__v2df)(__m128d)(Y), (int)(P))) 400 401#define _mm_cmp_ps(X, Y, P) \ 402 ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \ 403 (__v4sf)(__m128)(Y), (int)(P))) 404 405#define _mm256_cmp_pd(X, Y, P) \ 406 ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \ 407 (__v4df)(__m256d)(Y), (int)(P))) 408 409#define _mm256_cmp_ps(X, Y, P) \ 410 ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \ 411 (__v8sf)(__m256)(Y), (int)(P))) 412 413#define _mm_cmp_sd(X, Y, P) \ 414 ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \ 415 (__v2df)(__m128d)(Y), (int)(P))) 416 417#define _mm_cmp_ss(X, Y, P) \ 418 ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \ 419 (__v4sf)(__m128)(Y), (int)(P))) 420#endif 421 422extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 423_mm256_cvtepi32_pd (__m128i __A) 424{ 425 return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A); 426} 427 428extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 429_mm256_cvtepi32_ps (__m256i __A) 430{ 431 return 
(__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A); 432} 433 434extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 435_mm256_cvtpd_ps (__m256d __A) 436{ 437 return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A); 438} 439 440extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 441_mm256_cvtps_epi32 (__m256 __A) 442{ 443 return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A); 444} 445 446extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 447_mm256_cvtps_pd (__m128 __A) 448{ 449 return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A); 450} 451 452extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 453_mm256_cvttpd_epi32 (__m256d __A) 454{ 455 return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A); 456} 457 458extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 459_mm256_cvtpd_epi32 (__m256d __A) 460{ 461 return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A); 462} 463 464extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 465_mm256_cvttps_epi32 (__m256 __A) 466{ 467 return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A); 468} 469 470#ifdef __OPTIMIZE__ 471extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 472_mm256_extractf128_pd (__m256d __X, const int __N) 473{ 474 return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N); 475} 476 477extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 478_mm256_extractf128_ps (__m256 __X, const int __N) 479{ 480 return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N); 481} 482 483extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 484_mm256_extractf128_si256 (__m256i __X, const int __N) 485{ 486 return (__m128i) __builtin_ia32_vextractf128_si256 
((__v8si)__X, __N); 487} 488 489extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 490_mm256_extract_epi32 (__m256i __X, int const __N) 491{ 492 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); 493 return _mm_extract_epi32 (__Y, __N % 4); 494} 495 496extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 497_mm256_extract_epi16 (__m256i __X, int const __N) 498{ 499 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); 500 return _mm_extract_epi16 (__Y, __N % 8); 501} 502 503extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 504_mm256_extract_epi8 (__m256i __X, int const __N) 505{ 506 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); 507 return _mm_extract_epi8 (__Y, __N % 16); 508} 509 510#ifdef __x86_64__ 511extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 512_mm256_extract_epi64 (__m256i __X, const int __N) 513{ 514 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); 515 return _mm_extract_epi64 (__Y, __N % 2); 516} 517#endif 518#else 519#define _mm256_extractf128_pd(X, N) \ 520 ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \ 521 (int)(N))) 522 523#define _mm256_extractf128_ps(X, N) \ 524 ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \ 525 (int)(N))) 526 527#define _mm256_extractf128_si256(X, N) \ 528 ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \ 529 (int)(N))) 530 531#define _mm256_extract_epi32(X, N) \ 532 (__extension__ \ 533 ({ \ 534 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \ 535 _mm_extract_epi32 (__Y, (N) % 4); \ 536 })) 537 538#define _mm256_extract_epi16(X, N) \ 539 (__extension__ \ 540 ({ \ 541 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \ 542 _mm_extract_epi16 (__Y, (N) % 8); \ 543 })) 544 545#define _mm256_extract_epi8(X, N) \ 546 (__extension__ \ 547 ({ \ 548 __m128i __Y = 
_mm256_extractf128_si256 ((X), (N) >> 4); \ 549 _mm_extract_epi8 (__Y, (N) % 16); \ 550 })) 551 552#ifdef __x86_64__ 553#define _mm256_extract_epi64(X, N) \ 554 (__extension__ \ 555 ({ \ 556 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \ 557 _mm_extract_epi64 (__Y, (N) % 2); \ 558 })) 559#endif 560#endif 561 562extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 563_mm256_zeroall (void) 564{ 565 __builtin_ia32_vzeroall (); 566} 567 568extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 569_mm256_zeroupper (void) 570{ 571 __builtin_ia32_vzeroupper (); 572} 573 574extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 575_mm_permutevar_pd (__m128d __A, __m128i __C) 576{ 577 return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A, 578 (__v2di)__C); 579} 580 581extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 582_mm256_permutevar_pd (__m256d __A, __m256i __C) 583{ 584 return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A, 585 (__v4di)__C); 586} 587 588extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 589_mm_permutevar_ps (__m128 __A, __m128i __C) 590{ 591 return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A, 592 (__v4si)__C); 593} 594 595extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 596_mm256_permutevar_ps (__m256 __A, __m256i __C) 597{ 598 return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A, 599 (__v8si)__C); 600} 601 602#ifdef __OPTIMIZE__ 603extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 604_mm_permute_pd (__m128d __X, const int __C) 605{ 606 return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C); 607} 608 609extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 610_mm256_permute_pd (__m256d __X, const int __C) 611{ 612 
return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C); 613} 614 615extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 616_mm_permute_ps (__m128 __X, const int __C) 617{ 618 return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C); 619} 620 621extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 622_mm256_permute_ps (__m256 __X, const int __C) 623{ 624 return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C); 625} 626#else 627#define _mm_permute_pd(X, C) \ 628 ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C))) 629 630#define _mm256_permute_pd(X, C) \ 631 ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C))) 632 633#define _mm_permute_ps(X, C) \ 634 ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C))) 635 636#define _mm256_permute_ps(X, C) \ 637 ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C))) 638#endif 639 640#ifdef __OPTIMIZE__ 641extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 642_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C) 643{ 644 return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X, 645 (__v4df)__Y, 646 __C); 647} 648 649extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 650_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C) 651{ 652 return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X, 653 (__v8sf)__Y, 654 __C); 655} 656 657extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 658_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C) 659{ 660 return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X, 661 (__v8si)__Y, 662 __C); 663} 664#else 665#define _mm256_permute2f128_pd(X, Y, C) \ 666 ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \ 667 (__v4df)(__m256d)(Y), \ 668 (int)(C))) 669 670#define 
_mm256_permute2f128_ps(X, Y, C) \ 671 ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \ 672 (__v8sf)(__m256)(Y), \ 673 (int)(C))) 674 675#define _mm256_permute2f128_si256(X, Y, C) \ 676 ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \ 677 (__v8si)(__m256i)(Y), \ 678 (int)(C))) 679#endif 680 681extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 682_mm_broadcast_ss (float const *__X) 683{ 684 return (__m128) __builtin_ia32_vbroadcastss (__X); 685} 686 687extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 688_mm256_broadcast_sd (double const *__X) 689{ 690 return (__m256d) __builtin_ia32_vbroadcastsd256 (__X); 691} 692 693extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 694_mm256_broadcast_ss (float const *__X) 695{ 696 return (__m256) __builtin_ia32_vbroadcastss256 (__X); 697} 698 699extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 700_mm256_broadcast_pd (__m128d const *__X) 701{ 702 return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X); 703} 704 705extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 706_mm256_broadcast_ps (__m128 const *__X) 707{ 708 return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X); 709} 710 711#ifdef __OPTIMIZE__ 712extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 713_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O) 714{ 715 return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X, 716 (__v2df)__Y, 717 __O); 718} 719 720extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 721_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O) 722{ 723 return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X, 724 (__v4sf)__Y, 725 __O); 726} 727 728extern __inline __m256i __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) 729_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O) 730{ 731 return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X, 732 (__v4si)__Y, 733 __O); 734} 735 736extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 737_mm256_insert_epi32 (__m256i __X, int __D, int const __N) 738{ 739 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); 740 __Y = _mm_insert_epi32 (__Y, __D, __N % 4); 741 return _mm256_insertf128_si256 (__X, __Y, __N >> 2); 742} 743 744extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 745_mm256_insert_epi16 (__m256i __X, int __D, int const __N) 746{ 747 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); 748 __Y = _mm_insert_epi16 (__Y, __D, __N % 8); 749 return _mm256_insertf128_si256 (__X, __Y, __N >> 3); 750} 751 752extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 753_mm256_insert_epi8 (__m256i __X, int __D, int const __N) 754{ 755 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); 756 __Y = _mm_insert_epi8 (__Y, __D, __N % 16); 757 return _mm256_insertf128_si256 (__X, __Y, __N >> 4); 758} 759 760#ifdef __x86_64__ 761extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 762_mm256_insert_epi64 (__m256i __X, long long __D, int const __N) 763{ 764 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); 765 __Y = _mm_insert_epi64 (__Y, __D, __N % 2); 766 return _mm256_insertf128_si256 (__X, __Y, __N >> 1); 767} 768#endif 769#else 770#define _mm256_insertf128_pd(X, Y, O) \ 771 ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \ 772 (__v2df)(__m128d)(Y), \ 773 (int)(O))) 774 775#define _mm256_insertf128_ps(X, Y, O) \ 776 ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \ 777 (__v4sf)(__m128)(Y), \ 778 (int)(O))) 779 780#define _mm256_insertf128_si256(X, Y, O) \ 781 ((__m256i) 
__builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \ 782 (__v4si)(__m128i)(Y), \ 783 (int)(O))) 784 785#define _mm256_insert_epi32(X, D, N) \ 786 (__extension__ \ 787 ({ \ 788 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \ 789 __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \ 790 _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \ 791 })) 792 793#define _mm256_insert_epi16(X, D, N) \ 794 (__extension__ \ 795 ({ \ 796 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \ 797 __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \ 798 _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \ 799 })) 800 801#define _mm256_insert_epi8(X, D, N) \ 802 (__extension__ \ 803 ({ \ 804 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \ 805 __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \ 806 _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \ 807 })) 808 809#ifdef __x86_64__ 810#define _mm256_insert_epi64(X, D, N) \ 811 (__extension__ \ 812 ({ \ 813 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \ 814 __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \ 815 _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \ 816 })) 817#endif 818#endif 819 820extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 821_mm256_load_pd (double const *__P) 822{ 823 return *(__m256d *)__P; 824} 825 826extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 827_mm256_store_pd (double *__P, __m256d __A) 828{ 829 *(__m256d *)__P = __A; 830} 831 832extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 833_mm256_load_ps (float const *__P) 834{ 835 return *(__m256 *)__P; 836} 837 838extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 839_mm256_store_ps (float *__P, __m256 __A) 840{ 841 *(__m256 *)__P = __A; 842} 843 844extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 845_mm256_loadu_pd (double const *__P) 846{ 847 return (__m256d) 
__builtin_ia32_loadupd256 (__P); 848} 849 850extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 851_mm256_storeu_pd (double *__P, __m256d __A) 852{ 853 __builtin_ia32_storeupd256 (__P, (__v4df)__A); 854} 855 856extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 857_mm256_loadu_ps (float const *__P) 858{ 859 return (__m256) __builtin_ia32_loadups256 (__P); 860} 861 862extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 863_mm256_storeu_ps (float *__P, __m256 __A) 864{ 865 __builtin_ia32_storeups256 (__P, (__v8sf)__A); 866} 867 868extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 869_mm256_load_si256 (__m256i const *__P) 870{ 871 return *__P; 872} 873 874extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 875_mm256_store_si256 (__m256i *__P, __m256i __A) 876{ 877 *__P = __A; 878} 879 880extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 881_mm256_loadu_si256 (__m256i const *__P) 882{ 883 return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P); 884} 885 886extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 887_mm256_storeu_si256 (__m256i *__P, __m256i __A) 888{ 889 __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A); 890} 891 892extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 893_mm_maskload_pd (double const *__P, __m128i __M) 894{ 895 return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P, 896 (__v2di)__M); 897} 898 899extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 900_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A) 901{ 902 __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A); 903} 904 905extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
906_mm256_maskload_pd (double const *__P, __m256i __M) 907{ 908 return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P, 909 (__v4di)__M); 910} 911 912extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 913_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A) 914{ 915 __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A); 916} 917 918extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 919_mm_maskload_ps (float const *__P, __m128i __M) 920{ 921 return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P, 922 (__v4si)__M); 923} 924 925extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 926_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A) 927{ 928 __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A); 929} 930 931extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 932_mm256_maskload_ps (float const *__P, __m256i __M) 933{ 934 return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P, 935 (__v8si)__M); 936} 937 938extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 939_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A) 940{ 941 __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A); 942} 943 944extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 945_mm256_movehdup_ps (__m256 __X) 946{ 947 return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X); 948} 949 950extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 951_mm256_moveldup_ps (__m256 __X) 952{ 953 return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X); 954} 955 956extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 957_mm256_movedup_pd (__m256d __X) 958{ 959 return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X); 960} 961 
/* Load 256 bits of unaligned integer data with vlddqu, which may be
   faster than vmovdqu when the data crosses a cache-line boundary.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

/* Non-temporal (streaming) stores: write 256 bits to memory while
   minimizing cache pollution (vmovnt*).  NOTE(review): the vmovnt
   instructions require an aligned destination -- confirm against the
   Intel manual.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

/* Approximate the reciprocal of each SP FP element.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

/* Approximate the reciprocal square root of each SP FP element.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

/* Square root of each DP FP element.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

/* Square root of each SP FP element.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

/* Round each element according to the rounding-control immediate __M
   (one of the _MM_FROUND_* values).  The builtin requires a
   compile-time-constant immediate, which the inline functions only
   provide when optimizing; hence the macro fallback below.  */
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

/* Ceiling and floor expressed via the round intrinsics.  */
#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)

/* Interleave elements from the high (unpackhi) or low (unpacklo) half
   of each 128-bit lane of __A and __B (vunpck*).  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Bit-test intrinsics over the sign bits of FP elements
   (vtestpd/vtestps) and over whole 256-bit values (vptest): testz
   returns the resulting ZF flag, testc the CF flag, and testnzc is
   nonzero iff both ZF and CF would be clear.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

/* Collect the sign bits of the four DP FP elements into the low bits
   of the result.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

/* Collect the sign bits of the eight SP FP elements into the low bits
   of the result.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

/* Return a vector of all zeros.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].
*/
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  /* The first argument becomes the highest-indexed element.  */
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

/* Create a vector of 16 shorts; __q15 becomes the highest-indexed
   element, __q00 the lowest.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Create a vector of 32 chars; __q31 becomes the highest-indexed
   element, __q00 the lowest.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
		 char __q27, char __q26, char __q25, char __q24,
		 char __q23, char __q22, char __q21, char __q20,
		 char __q19, char __q18, char __q17, char __q16,
		 char __q15, char __q14, char __q13, char __q12,
		 char __q11, char __q10, char __q09, char __q08,
		 char __q07, char __q06, char __q05, char __q04,
		 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

/* Create a vector of four 64-bit integers; __A becomes the
   highest-indexed element.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

/* Create a vector with all 16 short elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector with all 32 char elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector with all four 64-bit elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.
*/

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

/* Truncating casts: return the low 128 bits of a 256-bit vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When cast is done from a 128 to 256-bit type, the low 128 bits of
   the 256-bit result contain source parameter value and the upper 128
   bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}