/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __AVXINTRIN_H
#define __AVXINTRIN_H

#ifndef __AVX__
#error "AVX instruction set not enabled"
#else

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a + b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a + b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a - b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a - b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_round_pd(__m256d v, const int m)
{
  return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_round_ps(__m256 v, const int m)
{
  return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
}

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

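/* Example (illustrative; not part of the original header): _mm256_round_pd
   takes an _MM_FROUND_* rounding control from <smmintrin.h>, which is
   visible when this header is reached through <immintrin.h>.
   Round-to-nearest-even:

     __m256d v = _mm256_set1_pd(2.5);
     __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT |
                                    _MM_FROUND_NO_EXC);  // { 2.0, 2.0, 2.0, 2.0 }
*/
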
/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}

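/* Example (illustrative; not part of the original header): because
   _mm256_andnot_pd computes ~a & b, a sign-bit mask in the first operand
   gives a branch-free absolute value (x is assumed to be some __m256d):

     __m256d sign = _mm256_set1_pd(-0.0);       // only the sign bit set
     __m256d absx = _mm256_andnot_pd(sign, x);  // |x| in all four lanes
*/
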
/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}

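/* Example (illustrative; not part of the original header): the horizontal
   ops work within each 128-bit half, so _mm256_hadd_pd(a, b) yields

     { a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3] }

   and a full 4-element reduction still needs a cross-lane step afterwards. */
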
/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permute_pd(__m128d a, const int c)
{
  return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute_pd(__m256d a, const int c)
{
  return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permute_ps(__m128 a, const int c)
{
  return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute_ps(__m256 a, const int c)
{
  return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
{
  return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
}

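/* Example (illustrative; not part of the original header): each nibble of
   the permute2f128 immediate selects a 128-bit half of the result (0/1 =
   low/high half of a, 2/3 = low/high half of b; bit 3 of a nibble zeroes
   that half instead).  Swapping the halves of a single vector:

     __m256d swapped = _mm256_permute2f128_pd(v, v, 0x01);
     // -> { v[2], v[3], v[0], v[1] }
*/
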
/* Vector Blend */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blend_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_blendpd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blend_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_blendps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}

/* Vector Dot Product */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_dp_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_dpps256((__v8sf)a, (__v8sf)b, c);
}

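/* Example (illustrative; not part of the original header): in _mm256_dp_ps
   the high nibble of the immediate selects which elements are multiplied
   and the low nibble selects where the sum is written, per 128-bit half:

     __m256 d = _mm256_dp_ps(a, b, 0xFF);
     // each half holds its own 4-element dot product, broadcast 4 times
*/
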
/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v8sf)(a), (__v8sf)(b), \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12))

#define _mm256_shuffle_pd(a, b, mask) \
        (__builtin_shufflevector((__v4df)(a), (__v4df)(b), \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6))

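/* Example (illustrative; not part of the original header): these behave like
   the SSE shuffles but per 128-bit half, so with _MM_SHUFFLE(3, 2, 1, 0)
   (0xE4, from <xmmintrin.h>):

     __m256 s = _mm256_shuffle_ps(a, b, 0xE4);
     // { a[0], a[1], b[2], b[3], a[4], a[5], b[6], b[7] }
*/
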
/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmp_pd(__m128d a, __m128d b, const int c)
{
  return (__m128d)__builtin_ia32_cmppd((__v2df)a, (__v2df)b, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmp_ps(__m128 a, __m128 b, const int c)
{
  return (__m128)__builtin_ia32_cmpps((__v4sf)a, (__v4sf)b, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cmp_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_cmppd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cmp_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_cmpps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmp_sd(__m128d a, __m128d b, const int c)
{
  return (__m128d)__builtin_ia32_cmpsd((__v2df)a, (__v2df)b, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmp_ss(__m128 a, __m128 b, const int c)
{
  return (__m128)__builtin_ia32_cmpss((__v4sf)a, (__v4sf)b, c);
}

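/* Example (illustrative; not part of the original header): a predicate from
   the table above plus _mm256_movemask_pd (defined later in this header)
   turns a lane-wise comparison into a bitmask:

     __m256d lt   = _mm256_cmp_pd(a, b, _CMP_LT_OS);
     int     bits = _mm256_movemask_pd(lt);   // bit i set iff a[i] < b[i]
*/
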
/* Vector extract */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_pd(__m256d a, const int o)
{
  return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_ps(__m256 a, const int o)
{
  return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_si256(__m256i a, const int o)
{
  return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif

/* Vector insert */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
{
  return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
{
  return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
{
  return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

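/* Example (illustrative; not part of the original header): the usual idiom
   for gluing two 128-bit vectors into one 256-bit vector combines a cast
   defined at the bottom of this header with an insert:

     __m256d both = _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
     // lanes 0-1 from lo, lanes 2-3 from hi
*/
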
/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}

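/* Example (illustrative; not part of the original header): the cvtt forms
   truncate while the cvt forms honor the current rounding mode (round to
   nearest even by default):

     __m256d d = _mm256_set1_pd(1.7);
     __m128i t = _mm256_cvttpd_epi32(d);   // { 1, 1, 1, 1 }
     __m128i r = _mm256_cvtpd_epi32(d);    // { 2, 2, 2, 2 }
*/
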
/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

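/* Example (illustrative; not part of the original header): as with the other
   256-bit shuffles, interleaving happens within each 128-bit half:

     __m256d lo = _mm256_unpacklo_pd(a, b);   // { a[0], b[0], a[2], b[2] }
     __m256d hi = _mm256_unpackhi_pd(a, b);   // { a[1], b[1], a[3], b[3] }
*/
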
/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}

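/* Example (illustrative; not part of the original header): testz returns 1
   when the AND of its operands is all zeros, so testing a vector against
   itself asks whether every bit is clear:

     int is_zero = _mm256_testz_si256(v, v);   // 1 iff v == 0
*/
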
/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

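/* Example (illustrative; not part of the original header): issuing
   _mm256_zeroupper before calling legacy SSE code avoids the transition
   penalty for mixing 256-bit and non-VEX 128-bit instructions:

     __m256 sum = _mm256_add_ps(x, y);
     _mm256_zeroupper();
     legacy_sse_function();   // hypothetical callee compiled without AVX
*/
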
/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}

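/* Example (illustrative; not part of the original header): the load/loadu
   and store/storeu pairs differ only in their alignment contract; the plain
   forms require 32-byte alignment (ptr here is any double pointer):

     double buf[4] __attribute__((aligned(32)));
     _mm256_store_pd(buf, v);                  // aligned store
     __m256d u = _mm256_loadu_pd(ptr + 1);     // unaligned load is fine
*/
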
/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

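/* Example (illustrative; not part of the original header): only the sign bit
   of each mask element matters; masked-off elements load as zero and are
   left untouched by a masked store (p is assumed to point at 4 doubles):

     __m256d m = _mm256_castsi256_pd(_mm256_setr_epi64x(-1, 0, -1, 0));
     __m256d v = _mm256_maskload_pd(p, m);   // loads p[0] and p[2], rest 0.0
*/
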
/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}

/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

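/* Example (illustrative; not part of the original header): _mm256_set_* take
   the highest-indexed element first, while _mm256_setr_* take memory order:

     __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);    // element 0 == 0.0
     __m256d b = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);   // same vector
*/
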
/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

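/* Example (illustrative; not part of the original header): the 256-bit casts
   are free bit-pattern reinterpretations; only the 128-to-256 forms may
   generate code, since this implementation zeroes the otherwise-undefined
   upper half:

     __m256i wide = _mm256_castsi128_si256(narrow);   // low half = narrow
     __m128i back = _mm256_castsi256_si128(wide);     // upper half dropped
*/
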
#endif /* __AVX__ */

#endif /* __AVXINTRIN_H */