1/*===---- xopintrin.h - FMA4 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __X86INTRIN_H 25#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead." 26#endif 27 28#ifndef __XOPINTRIN_H 29#define __XOPINTRIN_H 30 31#ifndef __XOP__ 32# error "XOP instruction set is not enabled" 33#else 34 35#include <fma4intrin.h> 36 37static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 38_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C) 39{ 40 return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); 41} 42 43static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 44_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C) 45{ 46 return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); 47} 48 49static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 50_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C) 51{ 52 return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); 53} 54 55static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 56_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C) 57{ 58 return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); 59} 60 61static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 62_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C) 63{ 64 return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C); 65} 66 67static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 68_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C) 69{ 70 return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C); 71} 72 73static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 74_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C) 75{ 76 return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C); 77} 78 79static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 80_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C) 81{ 82 return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C); 83} 84 85static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 86_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C) 87{ 88 return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C); 89} 90 91static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 92_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C) 93{ 94 return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C); 95} 96 97static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 98_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C) 99{ 100 return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); 101} 102 103static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 104_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C) 105{ 106 return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); 107} 108 109static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 110_mm_haddw_epi8(__m128i __A) 111{ 112 return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A); 113} 114 115static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 116_mm_haddd_epi8(__m128i __A) 117{ 118 return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A); 119} 120 121static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 122_mm_haddq_epi8(__m128i __A) 123{ 124 return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A); 125} 126 127static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 128_mm_haddd_epi16(__m128i __A) 129{ 130 return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A); 131} 132 133static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 134_mm_haddq_epi16(__m128i __A) 135{ 136 return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A); 137} 138 139static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 140_mm_haddq_epi32(__m128i __A) 141{ 142 return (__m128i)__builtin_ia32_vphadddq((__v4si)__A); 143} 144 145static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 146_mm_haddw_epu8(__m128i __A) 147{ 148 return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A); 149} 150 151static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 152_mm_haddd_epu8(__m128i __A) 153{ 154 return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A); 155} 156 157static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 158_mm_haddq_epu8(__m128i __A) 159{ 160 return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A); 161} 162 163static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 164_mm_haddd_epu16(__m128i __A) 165{ 166 return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A); 167} 168 169static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 170_mm_haddq_epu16(__m128i __A) 171{ 172 return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A); 173} 174 175static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 176_mm_haddq_epu32(__m128i __A) 177{ 178 return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A); 179} 180 181static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 182_mm_hsubw_epi8(__m128i __A) 183{ 184 return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A); 185} 186 187static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 188_mm_hsubd_epi16(__m128i __A) 189{ 190 return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A); 191} 192 193static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 194_mm_hsubq_epi32(__m128i __A) 195{ 196 return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A); 197} 198 199static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 200_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) 201{ 202 return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C); 203} 204 205static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 206_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C) 207{ 208 return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C); 209} 210 211static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 212_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) 213{ 214 return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C); 215} 216 217static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 218_mm_rot_epi8(__m128i __A, __m128i __B) 219{ 220 return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B); 221} 222 223static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 224_mm_rot_epi16(__m128i __A, __m128i __B) 225{ 226 return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B); 227} 228 229static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 230_mm_rot_epi32(__m128i __A, __m128i __B) 231{ 232 return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B); 233} 234 235static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 236_mm_rot_epi64(__m128i __A, __m128i __B) 237{ 238 return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B); 239} 240 241#define _mm_roti_epi8(A, N) __extension__ ({ \ 242 __m128i __A = (A); \ 243 (__m128i)__builtin_ia32_vprotbi((__v16qi)__A, (N)); }) 244 245#define _mm_roti_epi16(A, N) __extension__ ({ \ 246 __m128i __A = (A); \ 247 (__m128i)__builtin_ia32_vprotwi((__v8hi)__A, (N)); }) 248 249#define _mm_roti_epi32(A, N) __extension__ ({ \ 250 __m128i __A = (A); \ 251 (__m128i)__builtin_ia32_vprotdi((__v4si)__A, (N)); }) 252 253#define _mm_roti_epi64(A, N) __extension__ ({ \ 254 __m128i __A = (A); \ 255 (__m128i)__builtin_ia32_vprotqi((__v2di)__A, (N)); }) 256 257static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 258_mm_shl_epi8(__m128i __A, __m128i __B) 259{ 260 return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B); 261} 262 263static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 264_mm_shl_epi16(__m128i __A, __m128i __B) 265{ 266 return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B); 267} 268 269static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 270_mm_shl_epi32(__m128i __A, __m128i __B) 271{ 272 return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B); 273} 274 275static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 276_mm_shl_epi64(__m128i __A, __m128i __B) 277{ 278 return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B); 279} 280 281static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 282_mm_sha_epi8(__m128i __A, __m128i __B) 283{ 284 return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B); 285} 286 287static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 288_mm_sha_epi16(__m128i __A, __m128i __B) 289{ 290 return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B); 291} 292 293static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 294_mm_sha_epi32(__m128i __A, __m128i __B) 295{ 296 return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B); 297} 298 299static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 300_mm_sha_epi64(__m128i __A, __m128i __B) 301{ 302 return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B); 303} 304 305#define _mm_com_epu8(A, B, N) __extension__ ({ \ 306 __m128i __A = (A); \ 307 __m128i __B = (B); \ 308 (__m128i)__builtin_ia32_vpcomub((__v16qi)__A, (__v16qi)__B, (N)); }) 309 310#define _mm_com_epu16(A, B, N) __extension__ ({ \ 311 __m128i __A = (A); \ 312 __m128i __B = (B); \ 313 (__m128i)__builtin_ia32_vpcomuw((__v8hi)__A, (__v8hi)__B, (N)); }) 314 315#define _mm_com_epu32(A, B, N) __extension__ ({ \ 316 __m128i __A = (A); \ 317 __m128i __B = (B); \ 318 (__m128i)__builtin_ia32_vpcomud((__v4si)__A, (__v4si)__B, (N)); }) 319 320#define _mm_com_epu64(A, B, N) __extension__ ({ \ 321 __m128i __A = (A); \ 322 __m128i __B = (B); \ 323 (__m128i)__builtin_ia32_vpcomuq((__v2di)__A, (__v2di)__B, (N)); }) 324 325#define _mm_com_epi8(A, B, N) __extension__ ({ \ 326 __m128i __A = (A); \ 327 __m128i __B = (B); \ 328 (__m128i)__builtin_ia32_vpcomb((__v16qi)__A, (__v16qi)__B, (N)); }) 329 330#define _mm_com_epi16(A, B, N) __extension__ ({ \ 331 __m128i __A = (A); \ 332 __m128i __B = (B); \ 333 (__m128i)__builtin_ia32_vpcomw((__v8hi)__A, (__v8hi)__B, (N)); }) 334 335#define _mm_com_epi32(A, B, N) __extension__ ({ \ 336 __m128i __A = (A); \ 337 __m128i __B = (B); \ 338 (__m128i)__builtin_ia32_vpcomd((__v4si)__A, (__v4si)__B, (N)); }) 339 340#define _mm_com_epi64(A, B, N) __extension__ ({ \ 341 __m128i __A = (A); \ 342 __m128i __B = (B); \ 343 (__m128i)__builtin_ia32_vpcomq((__v2di)__A, (__v2di)__B, (N)); }) 344 345#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \ 346 __m128d __X = (X); \ 347 __m128d __Y = (Y); \ 348 __m128i __C = (C); \ 349 (__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, \ 350 (__v2di)__C, (I)); }) 351 352#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \ 353 __m256d __X = (X); \ 354 __m256d __Y = (Y); \ 355 __m256i __C = (C); \ 356 (__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, \ 357 (__v4di)__C, (I)); }) 358 359#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \ 360 __m128 __X = (X); \ 361 __m128 __Y = (Y); \ 362 __m128i __C = (C); \ 363 (__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, \ 364 (__v4si)__C, (I)); }) 365 366#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \ 367 __m256 __X = (X); \ 368 __m256 __Y = (Y); \ 369 __m256i __C = (C); \ 370 (__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, \ 371 (__v8si)__C, (I)); }) 372 373static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 374_mm_frcz_ss(__m128 __A) 375{ 376 return (__m128)__builtin_ia32_vfrczss((__v4sf)__A); 377} 378 379static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 380_mm_frcz_sd(__m128d __A) 381{ 382 return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A); 383} 384 385static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 386_mm_frcz_ps(__m128 __A) 387{ 388 return (__m128)__builtin_ia32_vfrczps((__v4sf)__A); 389} 390 391static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 392_mm_frcz_pd(__m128d __A) 393{ 394 return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A); 395} 396 397static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) 398_mm256_frcz_ps(__m256 __A) 399{ 400 return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A); 401} 402 403static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) 404_mm256_frcz_pd(__m256d __A) 405{ 406 return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A); 407} 408 409#endif /* __XOP__ */ 410 411#endif /* __XOPINTRIN_H */ 412