avxintrin.h revision 34a1da4354959522cd1721ce9ca099cc5c743f01
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

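/* 256-bit vector types.  The __v* typedefs are element-typed views used
 * internally by the intrinsics below; __m256, __m256d and __m256i are the
 * public single-precision, double-precision and integer vector types. */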
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a + b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a + b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a - b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a - b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

#define _mm256_round_pd(V, M) __extension__ ({ \
  __m256d __V = (V); \
  (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

#define _mm256_round_ps(V, M) __extension__ ({ \
  __m256 __V = (V); \
  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

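/* The M argument of _mm256_round_pd and _mm256_round_ps is an SSE4.1
 * rounding-control immediate (the _MM_FROUND_* macros from <smmintrin.h>);
 * for example, _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
 * rounds each element to the nearest integer without raising exceptions.
 * _mm256_ceil_* and _mm256_floor_* above are convenience wrappers. */
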
/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}

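/* These bitwise operations reinterpret the floating-point vectors as integer
 * vectors, so they map directly onto VANDPD/VANDPS and friends.  Note that
 * the andnot forms compute ~a & b (the first operand is complemented), which
 * is handy for sign-bit tricks: andnot with a vector of -0.0 as the first
 * operand yields the absolute value of the second. */
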
/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}

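/* Like the 256-bit shuffles and unpacks, the horizontal adds and subtracts
 * operate within each 128-bit lane: _mm256_hadd_pd(a, b), for instance,
 * produces { a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3] }, so a full
 * cross-lane reduction still needs an extract or permute of the upper half. */
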
/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
}

#define _mm_permute_pd(A, C) __extension__ ({ \
  __m128d __A = (A); \
  (__m128d)__builtin_ia32_vpermilpd((__v2df)__A, (C)); })

#define _mm256_permute_pd(A, C) __extension__ ({ \
  __m256d __A = (A); \
  (__m256d)__builtin_ia32_vpermilpd256((__v4df)__A, (C)); })

#define _mm_permute_ps(A, C) __extension__ ({ \
  __m128 __A = (A); \
  (__m128)__builtin_ia32_vpermilps((__v4sf)__A, (C)); })

#define _mm256_permute_ps(A, C) __extension__ ({ \
  __m256 __A = (A); \
  (__m256)__builtin_ia32_vpermilps256((__v8sf)__A, (C)); })

#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m256i __V2 = (V2); \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })

/* Vector Blend */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}

/* Vector Dot Product */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
        __m256 __a = (a); \
        __m256 __b = (b); \
        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
        __m256d __a = (a); \
        __m256d __b = (b); \
        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })

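/* _mm256_shuffle_ps takes the same 8-bit _MM_SHUFFLE(w, z, y, x) selector as
 * the 128-bit version, applied independently to each 128-bit lane: within a
 * lane, the low two selectors pick elements of a and the high two pick
 * elements of b.  _mm256_shuffle_pd likewise consumes one mask bit per
 * destination element, alternating between a and b within each lane. */
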
/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

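/* The third argument of the compare macros below is one of the 32 _CMP_*
 * predicates defined above.  Each result element is all-ones where the
 * predicate holds and all-zeros otherwise, which makes the result suitable
 * as a mask for _mm256_blendv_pd/_mm256_blendv_ps, for the logical ops
 * above, or for _mm256_movemask_pd/_mm256_movemask_ps.  For example,
 * _mm256_cmp_pd(x, y, _CMP_LT_OQ) produces a per-element mask of x < y
 * (ordered, non-signaling). */
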
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })

/* Vector extract */
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
  __m256d __A = (A); \
  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })

#define _mm256_extractf128_ps(A, O) __extension__ ({ \
  __m256 __A = (A); \
  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })

#define _mm256_extractf128_si256(A, O) __extension__ ({ \
  __m256i __A = (A); \
  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif

/* Vector insert */
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m128d __V2 = (V2); \
  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })

#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m128 __V2 = (V2); \
  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })

#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m128i __V2 = (V2); \
  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}

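/* Note the usual cvt/cvtt distinction: _mm256_cvttpd_epi32 and
 * _mm256_cvttps_epi32 truncate toward zero, while _mm256_cvtpd_epi32 and
 * _mm256_cvtps_epi32 round according to the current MXCSR rounding mode.
 * The double-precision conversions also change vector width, e.g.
 * _mm256_cvtpd_ps narrows four doubles into a __m128 of four floats. */
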
/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}

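/* The test intrinsics return the flags set by VTESTPD/VTESTPS/VPTEST as 0 or
 * 1: testz is 1 when (a & b) is all zero, testc is 1 when (~a & b) is all
 * zero, and testnzc is 1 when neither holds.  For the pd/ps variants only the
 * sign bit of each element participates, so they pair naturally with the
 * masks produced by the compare intrinsics above. */
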
/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

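/* _mm256_zeroupper (VZEROUPPER) clears the upper 128 bits of every YMM
 * register and is typically issued before calling legacy SSE code to avoid
 * the AVX-to-SSE transition penalty; _mm256_zeroall (VZEROALL) clears the
 * registers entirely. */
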
/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

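/* _mm256_load_pd, _mm256_load_ps and _mm256_load_si256 (and the matching
 * store intrinsics below) require 32-byte aligned pointers; the loadu/storeu
 * forms accept any alignment.  _mm256_lddqu_si256 (VLDDQU) behaves like
 * _mm256_loadu_si256 and may perform better on some implementations when the
 * load crosses a cache-line boundary. */
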
/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}

/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

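/* For the maskload/maskstore intrinsics above (VMASKMOVPS/VMASKMOVPD), the
 * high (sign) bit of each mask element selects whether that element is
 * transferred: masked-off elements read as zero on a load and leave memory
 * untouched on a store, and masked-off elements cannot fault. */
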
/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}

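/* The stream intrinsics are non-temporal stores (VMOVNTDQ/VMOVNTPD/VMOVNTPS):
 * they bypass the cache hierarchy, require a 32-byte aligned destination, and
 * are weakly ordered, so code typically issues _mm_sfence before the stored
 * data is read by another agent. */
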
/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
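/* The casts between the three 256-bit types only reinterpret the bits of a
 * vector and generate no code.  The 256-to-128-bit casts return the lower
 * half, and the 128-to-256-bit casts widen the value; Intel documents the
 * upper half of the widened result as undefined, while this implementation
 * happens to fill it with zeros via the shuffles below. */
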
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}