/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a + b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a + b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a - b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a - b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_round_pd(__m256d v, const int m)
{
  return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_round_ps(__m256 v, const int m)
{
  return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
}

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

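/* Example (illustrative sketch, not part of the original header): the
 * arithmetic intrinsics above operate lane-wise on every element at once.
 *
 *   __m256d x = _mm256_set1_pd(1.5);
 *   __m256d y = _mm256_set1_pd(2.0);
 *   __m256d s = _mm256_add_pd(x, y);      // {3.5, 3.5, 3.5, 3.5}
 *   __m256d f = _mm256_floor_pd(s);       // {3.0, 3.0, 3.0, 3.0}
 */
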
/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}

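/* Example (illustrative sketch): _mm256_andnot_pd computes ~a & b, which is
 * useful for clearing sign bits; the mask trick below is this comment's own
 * assumption, not something the header mandates.
 *
 *   __m256d sign = _mm256_set1_pd(-0.0);          // only the sign bits set
 *   __m256d absx = _mm256_andnot_pd(sign, x);     // |x| in every lane
 */
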
/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}

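/* Example (illustrative sketch): the horizontal ops add or subtract adjacent
 * pairs within each 128-bit half, interleaving results from both operands.
 *
 *   __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);   // {1, 2, 3, 4} low..high
 *   __m256d b = _mm256_set_pd(8.0, 7.0, 6.0, 5.0);   // {5, 6, 7, 8}
 *   __m256d h = _mm256_hadd_pd(a, b);                // {1+2, 5+6, 3+4, 7+8}
 */
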
/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permute_pd(__m128d a, const int c)
{
  return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute_pd(__m256d a, const int c)
{
  return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permute_ps(__m128 a, const int c)
{
  return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute_ps(__m256 a, const int c)
{
  return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
{
  return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
}

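/* Example (illustrative sketch): the permute2f128 immediate selects each
 * 128-bit half of the result from the four available halves (bits 1:0 pick
 * the low half, bits 5:4 the high half; 0..3 = a.lo, a.hi, b.lo, b.hi).
 *
 *   __m256d swapped = _mm256_permute2f128_pd(v, v, 0x01);  // swap halves of v
 */
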
/* Vector Blend */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, M); })

#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, M); })

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}

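/* Example (illustrative sketch): blendv selects per lane by the sign bit of
 * the third operand, so a compare mask can drive the selection directly.
 *
 *   __m256d m  = _mm256_cmp_pd(x, y, _CMP_LT_OQ);    // all-ones where x < y
 *   __m256d lo = _mm256_blendv_pd(y, x, m);          // per-lane min(x, y)
 */
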
/* Vector Dot Product */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, M); })

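/* Example (illustrative sketch): the high nibble of the dp_ps mask selects
 * which elements enter each 128-bit lane's dot product, and the low nibble
 * selects which result elements receive it; 0xF1 computes a 4-element dot
 * product per lane and stores it in element 0 of that lane.
 *
 *   __m256 d = _mm256_dp_ps(a, b, 0xF1);
 */
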
/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
        __m256 __a = (a); \
        __m256 __b = (b); \
        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
        __m256d __a = (a); \
        __m256d __b = (b); \
        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })

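/* Example (illustrative sketch): like most AVX operations, the shuffle acts
 * on each 128-bit lane independently; _MM_SHUFFLE (from xmmintrin.h) builds
 * the mask.
 *
 *   __m256 r = _mm256_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 *   // within each lane: r = {a0, a1, b2, b3}
 */
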
/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })

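/* Example (illustrative sketch): a compare yields an all-ones/all-zeros mask
 * per lane, which composes with the logical and blend operations above.
 *
 *   __m256 m = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_GT_OQ);
 *   __m256 r = _mm256_and_ps(m, x);                  // keep only positive lanes
 */
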
/* Vector extract */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_pd(__m256d a, const int o)
{
  return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_ps(__m256 a, const int o)
{
  return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_si256(__m256i a, const int o)
{
  return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif

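/* Example (illustrative sketch, with v and w as placeholder vectors): the
 * extractf128 immediate selects which 128-bit half to take (0 = low,
 * 1 = high).
 *
 *   __m128d hi = _mm256_extractf128_pd(v, 1);        // upper two doubles
 *   int     e2 = _mm256_extract_epi32(w, 2);         // third 32-bit element
 */
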
/* Vector insert */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
{
  return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
{
  return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
{
  return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

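/* Example (illustrative sketch, with lo and hi as placeholder __m128
 * values): insertf128 composes a 256-bit vector from 128-bit halves, the
 * counterpart of extractf128 above.
 *
 *   __m256 v = _mm256_castps128_ps256(lo);           // low half from lo
 *   v = _mm256_insertf128_ps(v, hi, 1);              // high half from hi
 */
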
/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}

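/* Example (illustrative sketch): the "tt" conversions truncate toward zero,
 * while the plain forms use the current rounding mode (round-to-nearest by
 * default).
 *
 *   __m256d d = _mm256_set1_pd(2.7);
 *   __m128i t = _mm256_cvttpd_epi32(d);              // {2, 2, 2, 2}
 *   __m128i r = _mm256_cvtpd_epi32(d);               // {3, 3, 3, 3}
 */
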
/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

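/* Example (illustrative sketch): like the shuffles, unpacking interleaves
 * within each 128-bit lane rather than across the whole register.
 *
 *   __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);   // {0, 1, 2, 3} low..high
 *   __m256d b = _mm256_set_pd(7.0, 6.0, 5.0, 4.0);   // {4, 5, 6, 7}
 *   __m256d l = _mm256_unpacklo_pd(a, b);            // {0, 4, 2, 6}
 */
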
/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}

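/* Example (illustrative sketch): testz returns the ZF result of the
 * underlying test instruction, so testing a vector against itself asks
 * whether it is all zero.
 *
 *   int all_zero = _mm256_testz_si256(v, v);         // 1 if v has no bits set
 */
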
/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

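/* Example (illustrative sketch): movemask packs the per-lane sign bits into
 * an integer, turning a vector compare into a scalar bitmask.
 *
 *   __m256d m    = _mm256_cmp_pd(x, y, _CMP_EQ_OQ);
 *   int     bits = _mm256_movemask_pd(m);            // bit i set if x[i] == y[i]
 */
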
/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

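/* Usage note (an assumption of this comment, not a requirement stated in the
 * header): calling _mm256_zeroupper() when leaving a block of AVX code avoids
 * the transition penalty some processors incur when legacy SSE instructions
 * run while the upper register halves are live.
 *
 *   _mm256_zeroupper();                              // before SSE-only code
 */
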
/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

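/* Example (illustrative sketch): the broadcasts replicate a scalar or a
 * 128-bit vector loaded from memory across every lane.
 *
 *   double  s = 3.14;
 *   __m256d v = _mm256_broadcast_sd(&s);             // {3.14, 3.14, 3.14, 3.14}
 */
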
/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

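/* Example (illustrative sketch): _mm256_load_pd dereferences a __m256d
 * pointer and therefore assumes 32-byte alignment; the loadu/lddqu forms
 * accept any address.
 *
 *   double buf[4] __attribute__((aligned(32))) = { 1, 2, 3, 4 };
 *   __m256d a = _mm256_load_pd(buf);                 // aligned load
 *   __m256d u = _mm256_loadu_pd(buf);                // no alignment required
 */
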
/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}

/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

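/* Example (illustrative sketch, with p, idx and limit as placeholders): the
 * mask's per-lane sign bits choose which elements are touched; masked-off
 * load lanes read as zero and masked-off store lanes leave memory unchanged.
 *
 *   __m256d m = _mm256_cmp_pd(idx, limit, _CMP_LT_OQ);   // lanes in range
 *   __m256d v = _mm256_maskload_pd(p, m);                // partial load
 */
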
/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}

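/* Usage note (an assumption of this comment): streaming stores are
 * non-temporal, bypass the cache, and need 32-byte-aligned addresses; a
 * store fence such as _mm_sfence() from xmmintrin.h orders them before other
 * threads read the data.
 *
 *   _mm256_stream_pd(dst, v);                        // dst 32-byte aligned
 *   _mm_sfence();
 */
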
/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

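/* Example (illustrative sketch): _mm256_set_* takes arguments from the
 * highest element down to the lowest, while the _mm256_setr_* forms below
 * take them in memory order; these two calls build the same vector.
 *
 *   __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
 *   __m256i b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
 */
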
/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

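/* Example (illustrative sketch): the 256-to-256 casts only reinterpret the
 * bit pattern and generate no instructions; in this implementation the
 * 128-to-256 casts additionally zero the upper half.
 *
 *   __m256i bits = _mm256_castpd_si256(v);           // reinterpret, no convert
 *   __m128d low  = _mm256_castpd256_pd128(v);        // lower two doubles
 */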