1/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
27
28#ifndef __AVXINTRIN_H
29#define __AVXINTRIN_H
30
/* Internal 256-bit (32-byte) element views used by the implementations in
 * this header. */
typedef double    __v4df  __attribute__((__vector_size__(32)));
typedef float     __v8sf  __attribute__((__vector_size__(32)));
typedef long long __v4di  __attribute__((__vector_size__(32)));
typedef int       __v8si  __attribute__((__vector_size__(32)));
typedef short     __v16hi __attribute__((__vector_size__(32)));
typedef char      __v32qi __attribute__((__vector_size__(32)));

/* Unsigned element views. */
typedef unsigned long long __v4du  __attribute__((__vector_size__(32)));
typedef unsigned int       __v8su  __attribute__((__vector_size__(32)));
typedef unsigned short     __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char      __v32qu __attribute__((__vector_size__(32)));

/* Plain char has implementation-defined signedness, so an explicitly signed
 * byte view is needed as well. It is not meant to appear in the interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types used in the intrinsic signatures. */
typedef float     __m256  __attribute__((__vector_size__(32)));
typedef double    __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
51
/* Attribute set shared by every function defined in this file: force
 * inlining, omit debug info, and compile the body for the "avx" target. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx")))
54
55/* Arithmetic */
56/// \brief Adds two 256-bit vectors of [4 x double].
57///
58/// \headerfile <x86intrin.h>
59///
60/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
61///
62/// \param __a
63///    A 256-bit vector of [4 x double] containing one of the source operands.
64/// \param __b
65///    A 256-bit vector of [4 x double] containing one of the source operands.
66/// \returns A 256-bit vector of [4 x double] containing the sums of both
67///    operands.
68static __inline __m256d __DEFAULT_FN_ATTRS
69_mm256_add_pd(__m256d __a, __m256d __b)
70{
71  return (__m256d)((__v4df)__a+(__v4df)__b);
72}
73
74/// \brief Adds two 256-bit vectors of [8 x float].
75///
76/// \headerfile <x86intrin.h>
77///
78/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
79///
80/// \param __a
81///    A 256-bit vector of [8 x float] containing one of the source operands.
82/// \param __b
83///    A 256-bit vector of [8 x float] containing one of the source operands.
84/// \returns A 256-bit vector of [8 x float] containing the sums of both
85///    operands.
86static __inline __m256 __DEFAULT_FN_ATTRS
87_mm256_add_ps(__m256 __a, __m256 __b)
88{
89  return (__m256)((__v8sf)__a+(__v8sf)__b);
90}
91
92/// \brief Subtracts two 256-bit vectors of [4 x double].
93///
94/// \headerfile <x86intrin.h>
95///
96/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
97///
98/// \param __a
99///    A 256-bit vector of [4 x double] containing the minuend.
100/// \param __b
101///    A 256-bit vector of [4 x double] containing the subtrahend.
102/// \returns A 256-bit vector of [4 x double] containing the differences between
103///    both operands.
104static __inline __m256d __DEFAULT_FN_ATTRS
105_mm256_sub_pd(__m256d __a, __m256d __b)
106{
107  return (__m256d)((__v4df)__a-(__v4df)__b);
108}
109
110/// \brief Subtracts two 256-bit vectors of [8 x float].
111///
112/// \headerfile <x86intrin.h>
113///
114/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
115///
116/// \param __a
117///    A 256-bit vector of [8 x float] containing the minuend.
118/// \param __b
119///    A 256-bit vector of [8 x float] containing the subtrahend.
120/// \returns A 256-bit vector of [8 x float] containing the differences between
121///    both operands.
122static __inline __m256 __DEFAULT_FN_ATTRS
123_mm256_sub_ps(__m256 __a, __m256 __b)
124{
125  return (__m256)((__v8sf)__a-(__v8sf)__b);
126}
127
128/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
129///    two 256-bit vectors of [4 x double].
130///
131/// \headerfile <x86intrin.h>
132///
133/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
134///
135/// \param __a
136///    A 256-bit vector of [4 x double] containing the left source operand.
137/// \param __b
138///    A 256-bit vector of [4 x double] containing the right source operand.
139/// \returns A 256-bit vector of [4 x double] containing the alternating sums
140///    and differences between both operands.
141static __inline __m256d __DEFAULT_FN_ATTRS
142_mm256_addsub_pd(__m256d __a, __m256d __b)
143{
144  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
145}
146
147/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
148///    two 256-bit vectors of [8 x float].
149///
150/// \headerfile <x86intrin.h>
151///
152/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
153///
154/// \param __a
155///    A 256-bit vector of [8 x float] containing the left source operand.
156/// \param __b
157///    A 256-bit vector of [8 x float] containing the right source operand.
158/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
159///    differences between both operands.
160static __inline __m256 __DEFAULT_FN_ATTRS
161_mm256_addsub_ps(__m256 __a, __m256 __b)
162{
163  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
164}
165
166/// \brief Divides two 256-bit vectors of [4 x double].
167///
168/// \headerfile <x86intrin.h>
169///
170/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
171///
172/// \param __a
173///    A 256-bit vector of [4 x double] containing the dividend.
174/// \param __b
175///    A 256-bit vector of [4 x double] containing the divisor.
176/// \returns A 256-bit vector of [4 x double] containing the quotients of both
177///    operands.
178static __inline __m256d __DEFAULT_FN_ATTRS
179_mm256_div_pd(__m256d __a, __m256d __b)
180{
181  return (__m256d)((__v4df)__a/(__v4df)__b);
182}
183
184/// \brief Divides two 256-bit vectors of [8 x float].
185///
186/// \headerfile <x86intrin.h>
187///
188/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
189///
190/// \param __a
191///    A 256-bit vector of [8 x float] containing the dividend.
192/// \param __b
193///    A 256-bit vector of [8 x float] containing the divisor.
194/// \returns A 256-bit vector of [8 x float] containing the quotients of both
195///    operands.
196static __inline __m256 __DEFAULT_FN_ATTRS
197_mm256_div_ps(__m256 __a, __m256 __b)
198{
199  return (__m256)((__v8sf)__a/(__v8sf)__b);
200}
201
202/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
203///    of each pair of values.
204///
205/// \headerfile <x86intrin.h>
206///
207/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
208///
209/// \param __a
210///    A 256-bit vector of [4 x double] containing one of the operands.
211/// \param __b
212///    A 256-bit vector of [4 x double] containing one of the operands.
213/// \returns A 256-bit vector of [4 x double] containing the maximum values
214///    between both operands.
215static __inline __m256d __DEFAULT_FN_ATTRS
216_mm256_max_pd(__m256d __a, __m256d __b)
217{
218  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
219}
220
221/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
222///    of each pair of values.
223///
224/// \headerfile <x86intrin.h>
225///
226/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
227///
228/// \param __a
229///    A 256-bit vector of [8 x float] containing one of the operands.
230/// \param __b
231///    A 256-bit vector of [8 x float] containing one of the operands.
232/// \returns A 256-bit vector of [8 x float] containing the maximum values
233///    between both operands.
234static __inline __m256 __DEFAULT_FN_ATTRS
235_mm256_max_ps(__m256 __a, __m256 __b)
236{
237  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
238}
239
240/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
241///    of each pair of values.
242///
243/// \headerfile <x86intrin.h>
244///
245/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
246///
247/// \param __a
248///    A 256-bit vector of [4 x double] containing one of the operands.
249/// \param __b
250///    A 256-bit vector of [4 x double] containing one of the operands.
251/// \returns A 256-bit vector of [4 x double] containing the minimum values
252///    between both operands.
253static __inline __m256d __DEFAULT_FN_ATTRS
254_mm256_min_pd(__m256d __a, __m256d __b)
255{
256  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
257}
258
259/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
260///    of each pair of values.
261///
262/// \headerfile <x86intrin.h>
263///
264/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
265///
266/// \param __a
267///    A 256-bit vector of [8 x float] containing one of the operands.
268/// \param __b
269///    A 256-bit vector of [8 x float] containing one of the operands.
270/// \returns A 256-bit vector of [8 x float] containing the minimum values
271///    between both operands.
272static __inline __m256 __DEFAULT_FN_ATTRS
273_mm256_min_ps(__m256 __a, __m256 __b)
274{
275  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
276}
277
278/// \brief Multiplies two 256-bit vectors of [4 x double].
279///
280/// \headerfile <x86intrin.h>
281///
282/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
283///
284/// \param __a
285///    A 256-bit vector of [4 x double] containing one of the operands.
286/// \param __b
287///    A 256-bit vector of [4 x double] containing one of the operands.
288/// \returns A 256-bit vector of [4 x double] containing the products of both
289///    operands.
290static __inline __m256d __DEFAULT_FN_ATTRS
291_mm256_mul_pd(__m256d __a, __m256d __b)
292{
293  return (__m256d)((__v4df)__a * (__v4df)__b);
294}
295
296/// \brief Multiplies two 256-bit vectors of [8 x float].
297///
298/// \headerfile <x86intrin.h>
299///
300/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
301///
302/// \param __a
303///    A 256-bit vector of [8 x float] containing one of the operands.
304/// \param __b
305///    A 256-bit vector of [8 x float] containing one of the operands.
306/// \returns A 256-bit vector of [8 x float] containing the products of both
307///    operands.
308static __inline __m256 __DEFAULT_FN_ATTRS
309_mm256_mul_ps(__m256 __a, __m256 __b)
310{
311  return (__m256)((__v8sf)__a * (__v8sf)__b);
312}
313
314/// \brief Calculates the square roots of the values in a 256-bit vector of
315///    [4 x double].
316///
317/// \headerfile <x86intrin.h>
318///
319/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
320///
321/// \param __a
322///    A 256-bit vector of [4 x double].
323/// \returns A 256-bit vector of [4 x double] containing the square roots of the
324///    values in the operand.
325static __inline __m256d __DEFAULT_FN_ATTRS
326_mm256_sqrt_pd(__m256d __a)
327{
328  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
329}
330
331/// \brief Calculates the square roots of the values in a 256-bit vector of
332///    [8 x float].
333///
334/// \headerfile <x86intrin.h>
335///
336/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
337///
338/// \param __a
339///    A 256-bit vector of [8 x float].
340/// \returns A 256-bit vector of [8 x float] containing the square roots of the
341///    values in the operand.
342static __inline __m256 __DEFAULT_FN_ATTRS
343_mm256_sqrt_ps(__m256 __a)
344{
345  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
346}
347
348/// \brief Calculates the reciprocal square roots of the values in a 256-bit
349///    vector of [8 x float].
350///
351/// \headerfile <x86intrin.h>
352///
353/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
354///
355/// \param __a
356///    A 256-bit vector of [8 x float].
357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
358///    roots of the values in the operand.
359static __inline __m256 __DEFAULT_FN_ATTRS
360_mm256_rsqrt_ps(__m256 __a)
361{
362  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
363}
364
365/// \brief Calculates the reciprocals of the values in a 256-bit vector of
366///    [8 x float].
367///
368/// \headerfile <x86intrin.h>
369///
370/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
371///
372/// \param __a
373///    A 256-bit vector of [8 x float].
374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
375///    values in the operand.
376static __inline __m256 __DEFAULT_FN_ATTRS
377_mm256_rcp_ps(__m256 __a)
378{
379  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
380}
381
/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
///    The macro expands to a single parenthesized expression, so it can be
///    used anywhere an expression is permitted and combines safely with
///    surrounding operators.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
413
/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
///    The macro expands to a single parenthesized expression, so it can be
///    used anywhere an expression is permitted and combines safely with
///    surrounding operators.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
445
/// \brief Rounds each value of a 256-bit vector of [4 x double] up to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
///    Expands to _mm256_round_pd() with the _MM_FROUND_CEIL mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_CEIL)
462
/// \brief Rounds each value of a 256-bit vector of [4 x double] down to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
///    Expands to _mm256_round_pd() with the _MM_FROUND_FLOOR mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)
480
/// \brief Rounds each value of a 256-bit vector of [8 x float] up to an
///    integer value. The results are returned as floating-point values.
///
///    Expands to _mm256_round_ps() with the _MM_FROUND_CEIL mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_CEIL)
497
/// \brief Rounds each value of a 256-bit vector of [8 x float] down to an
///    integer value. The results are returned as floating-point values.
///
///    Expands to _mm256_round_ps() with the _MM_FROUND_FLOOR mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_FLOOR)
514
515/* Logical */
516/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
517///
518/// \headerfile <x86intrin.h>
519///
520/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
521///
522/// \param __a
523///    A 256-bit vector of [4 x double] containing one of the source operands.
524/// \param __b
525///    A 256-bit vector of [4 x double] containing one of the source operands.
526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
527///    values between both operands.
528static __inline __m256d __DEFAULT_FN_ATTRS
529_mm256_and_pd(__m256d __a, __m256d __b)
530{
531  return (__m256d)((__v4du)__a & (__v4du)__b);
532}
533
534/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
535///
536/// \headerfile <x86intrin.h>
537///
538/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
539///
540/// \param __a
541///    A 256-bit vector of [8 x float] containing one of the source operands.
542/// \param __b
543///    A 256-bit vector of [8 x float] containing one of the source operands.
544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
545///    values between both operands.
546static __inline __m256 __DEFAULT_FN_ATTRS
547_mm256_and_ps(__m256 __a, __m256 __b)
548{
549  return (__m256)((__v8su)__a & (__v8su)__b);
550}
551
552/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
553///    the one's complement of the values contained in the first source operand.
554///
555/// \headerfile <x86intrin.h>
556///
557/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
558///
559/// \param __a
560///    A 256-bit vector of [4 x double] containing the left source operand. The
561///    one's complement of this value is used in the bitwise AND.
562/// \param __b
563///    A 256-bit vector of [4 x double] containing the right source operand.
564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
565///    values of the second operand and the one's complement of the first
566///    operand.
567static __inline __m256d __DEFAULT_FN_ATTRS
568_mm256_andnot_pd(__m256d __a, __m256d __b)
569{
570  return (__m256d)(~(__v4du)__a & (__v4du)__b);
571}
572
573/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
574///    the one's complement of the values contained in the first source operand.
575///
576/// \headerfile <x86intrin.h>
577///
578/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
579///
580/// \param __a
581///    A 256-bit vector of [8 x float] containing the left source operand. The
582///    one's complement of this value is used in the bitwise AND.
583/// \param __b
584///    A 256-bit vector of [8 x float] containing the right source operand.
585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
586///    values of the second operand and the one's complement of the first
587///    operand.
588static __inline __m256 __DEFAULT_FN_ATTRS
589_mm256_andnot_ps(__m256 __a, __m256 __b)
590{
591  return (__m256)(~(__v8su)__a & (__v8su)__b);
592}
593
594/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
595///
596/// \headerfile <x86intrin.h>
597///
598/// This intrinsic corresponds to the <c> VORPD </c> instruction.
599///
600/// \param __a
601///    A 256-bit vector of [4 x double] containing one of the source operands.
602/// \param __b
603///    A 256-bit vector of [4 x double] containing one of the source operands.
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
605///    values between both operands.
606static __inline __m256d __DEFAULT_FN_ATTRS
607_mm256_or_pd(__m256d __a, __m256d __b)
608{
609  return (__m256d)((__v4du)__a | (__v4du)__b);
610}
611
612/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
613///
614/// \headerfile <x86intrin.h>
615///
616/// This intrinsic corresponds to the <c> VORPS </c> instruction.
617///
618/// \param __a
619///    A 256-bit vector of [8 x float] containing one of the source operands.
620/// \param __b
621///    A 256-bit vector of [8 x float] containing one of the source operands.
622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
623///    values between both operands.
624static __inline __m256 __DEFAULT_FN_ATTRS
625_mm256_or_ps(__m256 __a, __m256 __b)
626{
627  return (__m256)((__v8su)__a | (__v8su)__b);
628}
629
630/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
631///
632/// \headerfile <x86intrin.h>
633///
634/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
635///
636/// \param __a
637///    A 256-bit vector of [4 x double] containing one of the source operands.
638/// \param __b
639///    A 256-bit vector of [4 x double] containing one of the source operands.
640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
641///    values between both operands.
642static __inline __m256d __DEFAULT_FN_ATTRS
643_mm256_xor_pd(__m256d __a, __m256d __b)
644{
645  return (__m256d)((__v4du)__a ^ (__v4du)__b);
646}
647
648/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
649///
650/// \headerfile <x86intrin.h>
651///
652/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
653///
654/// \param __a
655///    A 256-bit vector of [8 x float] containing one of the source operands.
656/// \param __b
657///    A 256-bit vector of [8 x float] containing one of the source operands.
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
659///    values between both operands.
660static __inline __m256 __DEFAULT_FN_ATTRS
661_mm256_xor_ps(__m256 __a, __m256 __b)
662{
663  return (__m256)((__v8su)__a ^ (__v8su)__b);
664}
665
666/* Horizontal arithmetic */
667/// \brief Horizontally adds the adjacent pairs of values contained in two
668///    256-bit vectors of [4 x double].
669///
670/// \headerfile <x86intrin.h>
671///
672/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
673///
674/// \param __a
675///    A 256-bit vector of [4 x double] containing one of the source operands.
676///    The horizontal sums of the values are returned in the even-indexed
677///    elements of a vector of [4 x double].
678/// \param __b
679///    A 256-bit vector of [4 x double] containing one of the source operands.
680///    The horizontal sums of the values are returned in the odd-indexed
681///    elements of a vector of [4 x double].
682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
683///    both operands.
684static __inline __m256d __DEFAULT_FN_ATTRS
685_mm256_hadd_pd(__m256d __a, __m256d __b)
686{
687  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
688}
689
690/// \brief Horizontally adds the adjacent pairs of values contained in two
691///    256-bit vectors of [8 x float].
692///
693/// \headerfile <x86intrin.h>
694///
695/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
696///
697/// \param __a
698///    A 256-bit vector of [8 x float] containing one of the source operands.
699///    The horizontal sums of the values are returned in the elements with
700///    index 0, 1, 4, 5 of a vector of [8 x float].
701/// \param __b
702///    A 256-bit vector of [8 x float] containing one of the source operands.
703///    The horizontal sums of the values are returned in the elements with
704///    index 2, 3, 6, 7 of a vector of [8 x float].
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
706///    both operands.
707static __inline __m256 __DEFAULT_FN_ATTRS
708_mm256_hadd_ps(__m256 __a, __m256 __b)
709{
710  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
711}
712
713/// \brief Horizontally subtracts the adjacent pairs of values contained in two
714///    256-bit vectors of [4 x double].
715///
716/// \headerfile <x86intrin.h>
717///
718/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
719///
720/// \param __a
721///    A 256-bit vector of [4 x double] containing one of the source operands.
722///    The horizontal differences between the values are returned in the
723///    even-indexed elements of a vector of [4 x double].
724/// \param __b
725///    A 256-bit vector of [4 x double] containing one of the source operands.
726///    The horizontal differences between the values are returned in the
727///    odd-indexed elements of a vector of [4 x double].
728/// \returns A 256-bit vector of [4 x double] containing the horizontal
729///    differences of both operands.
730static __inline __m256d __DEFAULT_FN_ATTRS
731_mm256_hsub_pd(__m256d __a, __m256d __b)
732{
733  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
734}
735
736/// \brief Horizontally subtracts the adjacent pairs of values contained in two
737///    256-bit vectors of [8 x float].
738///
739/// \headerfile <x86intrin.h>
740///
741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
742///
743/// \param __a
744///    A 256-bit vector of [8 x float] containing one of the source operands.
745///    The horizontal differences between the values are returned in the
746///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
747/// \param __b
748///    A 256-bit vector of [8 x float] containing one of the source operands.
749///    The horizontal differences between the values are returned in the
750///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
751/// \returns A 256-bit vector of [8 x float] containing the horizontal
752///    differences of both operands.
753static __inline __m256 __DEFAULT_FN_ATTRS
754_mm256_hsub_ps(__m256 __a, __m256 __b)
755{
756  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
757}
758
759/* Vector permutations */
760/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
761///    by the 128-bit integer vector operand.
762///
763/// \headerfile <x86intrin.h>
764///
765/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
766///
767/// \param __a
768///    A 128-bit vector of [2 x double].
769/// \param __c
770///    A 128-bit integer vector operand specifying how the values are to be
771///    copied. \n
772///    Bit [1]: \n
773///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
774///         vector. \n
775///      1: Bits [127:64] of the source are copied to bits [63:0] of the
776///         returned vector. \n
777///    Bit [65]: \n
778///      0: Bits [63:0] of the source are copied to bits [127:64] of the
779///         returned vector. \n
780///      1: Bits [127:64] of the source are copied to bits [127:64] of the
781///         returned vector.
782/// \returns A 128-bit vector of [2 x double] containing the copied values.
783static __inline __m128d __DEFAULT_FN_ATTRS
784_mm_permutevar_pd(__m128d __a, __m128i __c)
785{
786  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
787}
788
789/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
790///    by the 256-bit integer vector operand.
791///
792/// \headerfile <x86intrin.h>
793///
794/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
795///
796/// \param __a
797///    A 256-bit vector of [4 x double].
798/// \param __c
799///    A 256-bit integer vector operand specifying how the values are to be
800///    copied. \n
801///    Bit [1]: \n
802///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
803///         vector. \n
804///      1: Bits [127:64] of the source are copied to bits [63:0] of the
805///         returned vector. \n
806///    Bit [65]: \n
807///      0: Bits [63:0] of the source are copied to bits [127:64] of the
808///         returned vector. \n
809///      1: Bits [127:64] of the source are copied to bits [127:64] of the
810///         returned vector. \n
811///    Bit [129]: \n
812///      0: Bits [191:128] of the source are copied to bits [191:128] of the
813///         returned vector. \n
814///      1: Bits [255:192] of the source are copied to bits [191:128] of the
815///         returned vector. \n
816///    Bit [193]: \n
817///      0: Bits [191:128] of the source are copied to bits [255:192] of the
818///         returned vector. \n
819///      1: Bits [255:192] of the source are copied to bits [255:192] of the
820///    returned vector.
821/// \returns A 256-bit vector of [4 x double] containing the copied values.
822static __inline __m256d __DEFAULT_FN_ATTRS
823_mm256_permutevar_pd(__m256d __a, __m256i __c)
824{
825  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
826}
827
828/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
829///    specified by the 128-bit integer vector operand.
///
/// \headerfile <x86intrin.h>
831///
832/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
833///
834/// \param __a
835///    A 128-bit vector of [4 x float].
836/// \param __c
837///    A 128-bit integer vector operand specifying how the values are to be
838///    copied. \n
839///    Bits [1:0]: \n
840///      00: Bits [31:0] of the source are copied to bits [31:0] of the
841///          returned vector. \n
842///      01: Bits [63:32] of the source are copied to bits [31:0] of the
843///          returned vector. \n
844///      10: Bits [95:64] of the source are copied to bits [31:0] of the
845///          returned vector. \n
846///      11: Bits [127:96] of the source are copied to bits [31:0] of the
847///          returned vector. \n
848///    Bits [33:32]: \n
849///      00: Bits [31:0] of the source are copied to bits [63:32] of the
850///          returned vector. \n
851///      01: Bits [63:32] of the source are copied to bits [63:32] of the
852///          returned vector. \n
853///      10: Bits [95:64] of the source are copied to bits [63:32] of the
854///          returned vector. \n
855///      11: Bits [127:96] of the source are copied to bits [63:32] of the
856///          returned vector. \n
857///    Bits [65:64]: \n
858///      00: Bits [31:0] of the source are copied to bits [95:64] of the
859///          returned vector. \n
860///      01: Bits [63:32] of the source are copied to bits [95:64] of the
861///          returned vector. \n
862///      10: Bits [95:64] of the source are copied to bits [95:64] of the
863///          returned vector. \n
864///      11: Bits [127:96] of the source are copied to bits [95:64] of the
865///          returned vector. \n
866///    Bits [97:96]: \n
867///      00: Bits [31:0] of the source are copied to bits [127:96] of the
868///          returned vector. \n
869///      01: Bits [63:32] of the source are copied to bits [127:96] of the
870///          returned vector. \n
871///      10: Bits [95:64] of the source are copied to bits [127:96] of the
872///          returned vector. \n
873///      11: Bits [127:96] of the source are copied to bits [127:96] of the
874///          returned vector.
875/// \returns A 128-bit vector of [4 x float] containing the copied values.
876static __inline __m128 __DEFAULT_FN_ATTRS
877_mm_permutevar_ps(__m128 __a, __m128i __c)
878{
879  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
880}
881
882/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
883///    specified by the 256-bit integer vector operand.
884///
885/// \headerfile <x86intrin.h>
886///
887/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
888///
889/// \param __a
890///    A 256-bit vector of [8 x float].
891/// \param __c
892///    A 256-bit integer vector operand specifying how the values are to be
893///    copied. \n
894///    Bits [1:0]: \n
895///      00: Bits [31:0] of the source are copied to bits [31:0] of the
896///          returned vector. \n
897///      01: Bits [63:32] of the source are copied to bits [31:0] of the
898///          returned vector. \n
899///      10: Bits [95:64] of the source are copied to bits [31:0] of the
900///          returned vector. \n
901///      11: Bits [127:96] of the source are copied to bits [31:0] of the
902///          returned vector. \n
903///    Bits [33:32]: \n
904///      00: Bits [31:0] of the source are copied to bits [63:32] of the
905///          returned vector. \n
906///      01: Bits [63:32] of the source are copied to bits [63:32] of the
907///          returned vector. \n
908///      10: Bits [95:64] of the source are copied to bits [63:32] of the
909///          returned vector. \n
910///      11: Bits [127:96] of the source are copied to bits [63:32] of the
911///          returned vector. \n
912///    Bits [65:64]: \n
913///      00: Bits [31:0] of the source are copied to bits [95:64] of the
914///          returned vector. \n
915///      01: Bits [63:32] of the source are copied to bits [95:64] of the
916///          returned vector. \n
917///      10: Bits [95:64] of the source are copied to bits [95:64] of the
918///          returned vector. \n
919///      11: Bits [127:96] of the source are copied to bits [95:64] of the
920///          returned vector. \n
921///    Bits [97:96]: \n
922///      00: Bits [31:0] of the source are copied to bits [127:96] of the
923///          returned vector. \n
924///      01: Bits [63:32] of the source are copied to bits [127:96] of the
925///          returned vector. \n
926///      10: Bits [95:64] of the source are copied to bits [127:96] of the
927///          returned vector. \n
928///      11: Bits [127:96] of the source are copied to bits [127:96] of the
929///          returned vector. \n
930///    Bits [129:128]: \n
931///      00: Bits [159:128] of the source are copied to bits [159:128] of the
932///          returned vector. \n
933///      01: Bits [191:160] of the source are copied to bits [159:128] of the
934///          returned vector. \n
935///      10: Bits [223:192] of the source are copied to bits [159:128] of the
936///          returned vector. \n
937///      11: Bits [255:224] of the source are copied to bits [159:128] of the
938///          returned vector. \n
939///    Bits [161:160]: \n
940///      00: Bits [159:128] of the source are copied to bits [191:160] of the
941///          returned vector. \n
942///      01: Bits [191:160] of the source are copied to bits [191:160] of the
943///          returned vector. \n
944///      10: Bits [223:192] of the source are copied to bits [191:160] of the
945///          returned vector. \n
946///      11: Bits [255:224] of the source are copied to bits [191:160] of the
947///          returned vector. \n
948///    Bits [193:192]: \n
949///      00: Bits [159:128] of the source are copied to bits [223:192] of the
950///          returned vector. \n
951///      01: Bits [191:160] of the source are copied to bits [223:192] of the
952///          returned vector. \n
953///      10: Bits [223:192] of the source are copied to bits [223:192] of the
954///          returned vector. \n
955///      11: Bits [255:224] of the source are copied to bits [223:192] of the
956///          returned vector. \n
957///    Bits [225:224]: \n
958///      00: Bits [159:128] of the source are copied to bits [255:224] of the
959///          returned vector. \n
960///      01: Bits [191:160] of the source are copied to bits [255:224] of the
961///          returned vector. \n
962///      10: Bits [223:192] of the source are copied to bits [255:224] of the
963///          returned vector. \n
964///      11: Bits [255:224] of the source are copied to bits [255:224] of the
965///          returned vector.
966/// \returns A 256-bit vector of [8 x float] containing the copied values.
967static __inline __m256 __DEFAULT_FN_ATTRS
968_mm256_permutevar_ps(__m256 __a, __m256i __c)
969{
970  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
971}
972
973/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
974///    by the immediate integer operand.
975///
976/// \headerfile <x86intrin.h>
977///
978/// \code
979/// __m128d _mm_permute_pd(__m128d A, const int C);
980/// \endcode
981///
982/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
983///
984/// \param A
985///    A 128-bit vector of [2 x double].
986/// \param C
987///    An immediate integer operand specifying how the values are to be
988///    copied. \n
989///    Bit [0]: \n
990///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
991///         vector. \n
992///      1: Bits [127:64] of the source are copied to bits [63:0] of the
993///         returned vector. \n
994///    Bit [1]: \n
995///      0: Bits [63:0] of the source are copied to bits [127:64] of the
996///         returned vector. \n
997///      1: Bits [127:64] of the source are copied to bits [127:64] of the
998///         returned vector.
999/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) __extension__ ({ \
  /* Result element i takes its source index from bit i of C. */ \
  (__m128d)__builtin_shufflevector( \
      (__v2df)(__m128d)(A), (__v2df)_mm_undefined_pd(), \
      (C) & 0x1, \
      ((C) >> 1) & 0x1); })
1004
1005/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
1006///    the immediate integer operand.
1007///
1008/// \headerfile <x86intrin.h>
1009///
1010/// \code
1011/// __m256d _mm256_permute_pd(__m256d A, const int C);
1012/// \endcode
1013///
1014/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1015///
1016/// \param A
1017///    A 256-bit vector of [4 x double].
1018/// \param C
1019///    An immediate integer operand specifying how the values are to be
1020///    copied. \n
1021///    Bit [0]: \n
1022///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1023///         vector. \n
1024///      1: Bits [127:64] of the source are copied to bits [63:0] of the
1025///         returned vector. \n
1026///    Bit [1]: \n
1027///      0: Bits [63:0] of the source are copied to bits [127:64] of the
1028///         returned vector. \n
1029///      1: Bits [127:64] of the source are copied to bits [127:64] of the
1030///         returned vector. \n
1031///    Bit [2]: \n
1032///      0: Bits [191:128] of the source are copied to bits [191:128] of the
1033///         returned vector. \n
1034///      1: Bits [255:192] of the source are copied to bits [191:128] of the
1035///         returned vector. \n
1036///    Bit [3]: \n
1037///      0: Bits [191:128] of the source are copied to bits [255:192] of the
1038///         returned vector. \n
1039///      1: Bits [255:192] of the source are copied to bits [255:192] of the
1040///         returned vector.
1041/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) __extension__ ({ \
  /* Bits 0 and 1 of C pick among elements 0..1; bits 2 and 3 pick among */ \
  /* elements 2..3, which is why the last two indices are offset by 2. */ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), \
      ((C) & 0x1), \
      (((C) >> 1) & 0x1), \
      (((C) >> 2) & 0x1) + 2, \
      (((C) >> 3) & 0x1) + 2); })
1049
1050/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
1051///    the immediate integer operand.
1052///
1053/// \headerfile <x86intrin.h>
1054///
1055/// \code
1056/// __m128 _mm_permute_ps(__m128 A, const int C);
1057/// \endcode
1058///
1059/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1060///
1061/// \param A
1062///    A 128-bit vector of [4 x float].
1063/// \param C
1064///    An immediate integer operand specifying how the values are to be
1065///    copied. \n
1066///    Bits [1:0]: \n
1067///      00: Bits [31:0] of the source are copied to bits [31:0] of the
1068///          returned vector. \n
1069///      01: Bits [63:32] of the source are copied to bits [31:0] of the
1070///          returned vector. \n
1071///      10: Bits [95:64] of the source are copied to bits [31:0] of the
1072///          returned vector. \n
1073///      11: Bits [127:96] of the source are copied to bits [31:0] of the
1074///          returned vector. \n
1075///    Bits [3:2]: \n
1076///      00: Bits [31:0] of the source are copied to bits [63:32] of the
1077///          returned vector. \n
1078///      01: Bits [63:32] of the source are copied to bits [63:32] of the
1079///          returned vector. \n
1080///      10: Bits [95:64] of the source are copied to bits [63:32] of the
1081///          returned vector. \n
1082///      11: Bits [127:96] of the source are copied to bits [63:32] of the
1083///          returned vector. \n
1084///    Bits [5:4]: \n
1085///      00: Bits [31:0] of the source are copied to bits [95:64] of the
1086///          returned vector. \n
1087///      01: Bits [63:32] of the source are copied to bits [95:64] of the
1088///          returned vector. \n
1089///      10: Bits [95:64] of the source are copied to bits [95:64] of the
1090///          returned vector. \n
1091///      11: Bits [127:96] of the source are copied to bits [95:64] of the
1092///          returned vector. \n
1093///    Bits [7:6]: \n
1094///      00: Bits [31:0] of the source are copied to bits [127:96] of the
1095///          returned vector. \n
1096///      01: Bits [63:32] of the source are copied to bits [127:96] of the
1097///          returned vector. \n
1098///      10: Bits [95:64] of the source are copied to bits [127:96] of the
1099///          returned vector. \n
1100///      11: Bits [127:96] of the source are copied to bits [127:96] of the
1101///          returned vector.
1102/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) __extension__ ({ \
  /* Each 2-bit field of C, lowest field first, is the source index for */ \
  /* one result element. */ \
  (__m128)__builtin_shufflevector( \
      (__v4sf)(__m128)(A), (__v4sf)_mm_undefined_ps(), \
      (C) & 0x3, \
      ((C) >> 2) & 0x3, \
      ((C) >> 4) & 0x3, \
      ((C) >> 6) & 0x3); })
1108
1109/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
1110///    the immediate integer operand.
1111///
1112/// \headerfile <x86intrin.h>
1113///
1114/// \code
1115/// __m256 _mm256_permute_ps(__m256 A, const int C);
1116/// \endcode
1117///
1118/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1119///
1120/// \param A
1121///    A 256-bit vector of [8 x float].
1122/// \param C
///    An immediate integer operand specifying how the values are to be
1124///    copied. \n
1125///    Bits [1:0]: \n
1126///      00: Bits [31:0] of the source are copied to bits [31:0] of the
1127///          returned vector. \n
1128///      01: Bits [63:32] of the source are copied to bits [31:0] of the
1129///          returned vector. \n
1130///      10: Bits [95:64] of the source are copied to bits [31:0] of the
1131///          returned vector. \n
1132///      11: Bits [127:96] of the source are copied to bits [31:0] of the
1133///          returned vector. \n
1134///    Bits [3:2]: \n
1135///      00: Bits [31:0] of the source are copied to bits [63:32] of the
1136///          returned vector. \n
1137///      01: Bits [63:32] of the source are copied to bits [63:32] of the
1138///          returned vector. \n
1139///      10: Bits [95:64] of the source are copied to bits [63:32] of the
1140///          returned vector. \n
1141///      11: Bits [127:96] of the source are copied to bits [63:32] of the
1142///          returned vector. \n
1143///    Bits [5:4]: \n
1144///      00: Bits [31:0] of the source are copied to bits [95:64] of the
1145///          returned vector. \n
1146///      01: Bits [63:32] of the source are copied to bits [95:64] of the
1147///          returned vector. \n
1148///      10: Bits [95:64] of the source are copied to bits [95:64] of the
1149///          returned vector. \n
1150///      11: Bits [127:96] of the source are copied to bits [95:64] of the
1151///          returned vector. \n
1152///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
1154///          returned vector. \n
1155///      01: Bits [63:32] of the source are copied to bits [127:96] of the
1156///          returned vector. \n
1157///      10: Bits [95:64] of the source are copied to bits [127:96] of the
1158///          returned vector. \n
1159///      11: Bits [127:96] of the source are copied to bits [127:96] of the
1160///          returned vector. \n
///    Bits [1:0] (reused for the upper 128 bits): \n
1162///      00: Bits [159:128] of the source are copied to bits [159:128] of the
1163///          returned vector. \n
1164///      01: Bits [191:160] of the source are copied to bits [159:128] of the
1165///          returned vector. \n
1166///      10: Bits [223:192] of the source are copied to bits [159:128] of the
1167///          returned vector. \n
1168///      11: Bits [255:224] of the source are copied to bits [159:128] of the
1169///          returned vector. \n
///    Bits [3:2] (reused for the upper 128 bits): \n
1171///      00: Bits [159:128] of the source are copied to bits [191:160] of the
1172///          returned vector. \n
1173///      01: Bits [191:160] of the source are copied to bits [191:160] of the
1174///          returned vector. \n
1175///      10: Bits [223:192] of the source are copied to bits [191:160] of the
1176///          returned vector. \n
1177///      11: Bits [255:224] of the source are copied to bits [191:160] of the
1178///          returned vector. \n
///    Bits [5:4] (reused for the upper 128 bits): \n
1180///      00: Bits [159:128] of the source are copied to bits [223:192] of the
1181///          returned vector. \n
1182///      01: Bits [191:160] of the source are copied to bits [223:192] of the
1183///          returned vector. \n
1184///      10: Bits [223:192] of the source are copied to bits [223:192] of the
1185///          returned vector. \n
1186///      11: Bits [255:224] of the source are copied to bits [223:192] of the
1187///          returned vector. \n
///    Bits [7:6] (reused for the upper 128 bits): \n
1189///      00: Bits [159:128] of the source are copied to bits [255:224] of the
1190///          returned vector. \n
1191///      01: Bits [191:160] of the source are copied to bits [255:224] of the
1192///          returned vector. \n
1193///      10: Bits [223:192] of the source are copied to bits [255:224] of the
1194///          returned vector. \n
1195///      11: Bits [255:224] of the source are copied to bits [255:224] of the
1196///          returned vector.
1197/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \
  /* The same four 2-bit fields of C are consumed twice: as-is for result */ \
  /* elements 0..3, and offset by 4 for result elements 4..7. */ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), \
      ((C) & 0x3), \
      (((C) >> 2) & 0x3), \
      (((C) >> 4) & 0x3), \
      (((C) >> 6) & 0x3), \
      ((C) & 0x3) + 4, \
      (((C) >> 2) & 0x3) + 4, \
      (((C) >> 4) & 0x3) + 4, \
      (((C) >> 6) & 0x3) + 4); })
1209
1210/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
1211///    [4 x double], as specified by the immediate integer operand.
1212///
1213/// \headerfile <x86intrin.h>
1214///
1215/// \code
1216/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1217/// \endcode
1218///
1219/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1220///
1221/// \param V1
1222///    A 256-bit vector of [4 x double].
1223/// \param V2
///    A 256-bit vector of [4 x double].
1225/// \param M
1226///    An immediate integer operand specifying how the values are to be
1227///    permuted. \n
1228///    Bits [1:0]: \n
1229///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1230///          destination. \n
1231///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1232///          destination. \n
1233///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1234///          destination. \n
1235///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1236///          destination. \n
1237///    Bits [5:4]: \n
1238///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1239///          destination. \n
1240///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1241///          destination. \n
1242///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1243///          destination. \n
1244///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1245///          destination.
1246/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  /* Both vectors are viewed as [4 x double]; M is forwarded unchanged. */ \
  (__m256d)__builtin_ia32_vperm2f128_pd256( \
      (__v4df)(__m256d)(V1), \
      (__v4df)(__m256d)(V2), \
      (M)); })
1250
1251/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
1252///    [8 x float], as specified by the immediate integer operand.
1253///
1254/// \headerfile <x86intrin.h>
1255///
1256/// \code
1257/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1258/// \endcode
1259///
1260/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1261///
1262/// \param V1
1263///    A 256-bit vector of [8 x float].
1264/// \param V2
1265///    A 256-bit vector of [8 x float].
1266/// \param M
1267///    An immediate integer operand specifying how the values are to be
1268///    permuted. \n
1269///    Bits [1:0]: \n
1270///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1271///    destination. \n
1272///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1273///    destination. \n
1274///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1275///    destination. \n
1276///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1277///    destination. \n
1278///    Bits [5:4]: \n
1279///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1280///    destination. \n
1281///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1282///    destination. \n
1283///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1284///    destination. \n
1285///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1286///    destination.
1287/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  /* Both vectors are viewed as [8 x float]; M is forwarded unchanged. */ \
  (__m256)__builtin_ia32_vperm2f128_ps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (M)); })
1291
1292/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
1293///    as specified by the immediate integer operand.
1294///
1295/// \headerfile <x86intrin.h>
1296///
1297/// \code
1298/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1299/// \endcode
1300///
1301/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1302///
1303/// \param V1
1304///    A 256-bit integer vector.
1305/// \param V2
1306///    A 256-bit integer vector.
1307/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
1309///    Bits [1:0]: \n
1310///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1311///    destination. \n
1312///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1313///    destination. \n
1314///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1315///    destination. \n
1316///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1317///    destination. \n
1318///    Bits [5:4]: \n
1319///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1320///    destination. \n
1321///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1322///    destination. \n
1323///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1324///    destination. \n
1325///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1326///    destination.
1327/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  /* Both vectors are viewed as [8 x i32]; M is forwarded unchanged. */ \
  (__m256i)__builtin_ia32_vperm2f128_si256( \
      (__v8si)(__m256i)(V1), \
      (__v8si)(__m256i)(V2), \
      (M)); })
1331
1332/* Vector Blend */
1333/// \brief Merges 64-bit double-precision data values stored in either of the
1334///    two 256-bit vectors of [4 x double], as specified by the immediate
1335///    integer operand.
1336///
1337/// \headerfile <x86intrin.h>
1338///
1339/// \code
1340/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1341/// \endcode
1342///
1343/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1344///
1345/// \param V1
1346///    A 256-bit vector of [4 x double].
1347/// \param V2
1348///    A 256-bit vector of [4 x double].
1349/// \param M
1350///    An immediate integer operand, with mask bits [3:0] specifying how the
1351///    values are to be copied. The position of the mask bit corresponds to the
1352///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
1353///    element in operand \a V1 is copied to the same position in the
1354///    destination. When a mask bit is 1, the corresponding 64-bit element in
1355///    operand \a V2 is copied to the same position in the destination.
1356/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  /* For position k, a clear mask bit k keeps index k (element of V1); a */ \
  /* set bit selects index k + 4 (same position in V2). */ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(V1), (__v4df)(__m256d)(V2), \
      (((M) & (1 << 0)) ? 4 : 0), \
      (((M) & (1 << 1)) ? 5 : 1), \
      (((M) & (1 << 2)) ? 6 : 2), \
      (((M) & (1 << 3)) ? 7 : 3)); })
1364
1365/// \brief Merges 32-bit single-precision data values stored in either of the
1366///    two 256-bit vectors of [8 x float], as specified by the immediate
1367///    integer operand.
1368///
1369/// \headerfile <x86intrin.h>
1370///
1371/// \code
1372/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1373/// \endcode
1374///
1375/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1376///
1377/// \param V1
1378///    A 256-bit vector of [8 x float].
1379/// \param V2
1380///    A 256-bit vector of [8 x float].
1381/// \param M
1382///    An immediate integer operand, with mask bits [7:0] specifying how the
1383///    values are to be copied. The position of the mask bit corresponds to the
1384///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
1385///    element in operand \a V1 is copied to the same position in the
1386///    destination. When a mask bit is 1, the corresponding 32-bit element in
1387///    operand \a V2 is copied to the same position in the destination.
1388/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  /* For position k, a clear mask bit k keeps index k (element of V1); a */ \
  /* set bit selects index k + 8 (same position in V2). */ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), \
      (((M) & (1 << 0)) ?  8 : 0), \
      (((M) & (1 << 1)) ?  9 : 1), \
      (((M) & (1 << 2)) ? 10 : 2), \
      (((M) & (1 << 3)) ? 11 : 3), \
      (((M) & (1 << 4)) ? 12 : 4), \
      (((M) & (1 << 5)) ? 13 : 5), \
      (((M) & (1 << 6)) ? 14 : 6), \
      (((M) & (1 << 7)) ? 15 : 7)); })
1400
1401/// \brief Merges 64-bit double-precision data values stored in either of the
1402///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1403///    operand.
1404///
1405/// \headerfile <x86intrin.h>
1406///
1407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1408///
1409/// \param __a
1410///    A 256-bit vector of [4 x double].
1411/// \param __b
1412///    A 256-bit vector of [4 x double].
1413/// \param __c
1414///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1415///    how the values are to be copied. The position of the mask bit corresponds
1416///    to the most significant bit of a copied value. When a mask bit is 0, the
1417///    corresponding 64-bit element in operand \a __a is copied to the same
1418///    position in the destination. When a mask bit is 1, the corresponding
1419///    64-bit element in operand \a __b is copied to the same position in the
1420///    destination.
1421/// \returns A 256-bit vector of [4 x double] containing the copied values.
1422static __inline __m256d __DEFAULT_FN_ATTRS
1423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1424{
1425  return (__m256d)__builtin_ia32_blendvpd256(
1426    (__v4df)__a, (__v4df)__b, (__v4df)__c);
1427}
1428
1429/// \brief Merges 32-bit single-precision data values stored in either of the
1430///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1431///    operand.
1432///
1433/// \headerfile <x86intrin.h>
1434///
1435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1436///
1437/// \param __a
1438///    A 256-bit vector of [8 x float].
1439/// \param __b
1440///    A 256-bit vector of [8 x float].
1441/// \param __c
1442///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1443///    and 31 specifying how the values are to be copied. The position of the
1444///    mask bit corresponds to the most significant bit of a copied value. When
1445///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1446///    copied to the same position in the destination. When a mask bit is 1, the
1447///    corresponding 32-bit element in operand \a __b is copied to the same
1448///    position in the destination.
1449/// \returns A 256-bit vector of [8 x float] containing the copied values.
1450static __inline __m256 __DEFAULT_FN_ATTRS
1451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1452{
1453  return (__m256)__builtin_ia32_blendvps256(
1454    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1455}
1456
1457/* Vector Dot Product */
1458/// \brief Computes two dot products in parallel, using the lower and upper
1459///    halves of two [8 x float] vectors as input to the two computations, and
1460///    returning the two dot products in the lower and upper halves of the
1461///    [8 x float] result. The immediate integer operand controls which input
1462///    elements will contribute to the dot product, and where the final results
1463///    are returned. In general, for each dot product, the four corresponding
1464///    elements of the input vectors are multiplied; the first two and second
1465///    two products are summed, then the two sums are added to form the final
1466///    result.
1467///
1468/// \headerfile <x86intrin.h>
1469///
1470/// \code
1471/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1472/// \endcode
1473///
1474/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1475///
1476/// \param V1
1477///    A vector of [8 x float] values, treated as two [4 x float] vectors.
1478/// \param V2
1479///    A vector of [8 x float] values, treated as two [4 x float] vectors.
1480/// \param M
1481///    An immediate integer argument. Bits [7:4] determine which elements of
1482///    the input vectors are used, with bit [4] corresponding to the lowest
1483///    element and bit [7] corresponding to the highest element of each [4 x
1484///    float] subvector. If a bit is set, the corresponding elements from the
1485///    two input vectors are used as an input for dot product; otherwise that
1486///    input is treated as zero. Bits [3:0] determine which elements of the
1487///    result will receive a copy of the final dot product, with bit [0]
1488///    corresponding to the lowest element and bit [3] corresponding to the
1489///    highest element of each [4 x float] subvector. If a bit is set, the dot
1490///    product is returned in the corresponding element; otherwise that element
1491///    is set to zero. The bitmask is applied in the same way to each of the
1492///    two parallel dot product computations.
1493/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  /* M is forwarded unchanged as the builtin's immediate operand. */ \
  (__m256)__builtin_ia32_dpps256( \
      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), (M)); })
1497
1498/* Vector shuffle */
1499/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
1500///    specified by the immediate value operand. The four selected elements in
1501///    each operand are copied to the destination according to the bits
1502///    specified in the immediate operand. The selected elements from the first
1503///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
1504///    destination, and the selected elements from the second 256-bit operand
1505///    are copied to bits [127:64] and bits [255:192] of the destination. For
1506///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
1507///    the 256-bit destination vector would contain the following values: b[7],
1508///    b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1509///
1510/// \headerfile <x86intrin.h>
1511///
1512/// \code
1513/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1514/// \endcode
1515///
1516/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1517///
1518/// \param a
1519///    A 256-bit vector of [8 x float]. The four selected elements in this
1520///    operand are copied to bits [63:0] and bits [191:128] in the destination,
1521///    according to the bits specified in the immediate operand.
1522/// \param b
1523///    A 256-bit vector of [8 x float]. The four selected elements in this
1524///    operand are copied to bits [127:64] and bits [255:192] in the
1525///    destination, according to the bits specified in the immediate operand.
1526/// \param mask
1527///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
1529///    Bits [3:0] specify the values copied from operand \a a. \n
1530///    Bits [7:4] specify the values copied from operand \a b. \n
1531///    The destinations within the 256-bit destination are assigned values as
1532///    follows, according to the bit value assignments described below: \n
1533///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1534///    destination. \n
1535///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1536///    destination. \n
1537///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1538///    destination. \n
1539///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1540///    the destination. \n
1541///    Bit value assignments: \n
1542///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1543///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1544///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1545///    11: Bits [127:96] and [255:224] are copied from the selected operand.
1546/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(a), (__v8sf)(__m256)(b), \
      /* Shuffle indices 0-7 address elements of a, 8-15 elements of b. */ \
      /* Lower 128-bit lane: two picks from a, then two picks from b. */ \
      (((mask) >> 0) & 0x3) + 0, \
      (((mask) >> 2) & 0x3) + 0, \
      (((mask) >> 4) & 0x3) + 8, \
      (((mask) >> 6) & 0x3) + 8, \
      /* Upper 128-bit lane: same 2-bit selectors, offset by four. */ \
      (((mask) >> 0) & 0x3) + 4, \
      (((mask) >> 2) & 0x3) + 4, \
      (((mask) >> 4) & 0x3) + 12, \
      (((mask) >> 6) & 0x3) + 12); })
1558
1559/// \brief Selects four double-precision values from the 256-bit operands of
1560///    [4 x double], as specified by the immediate value operand. The selected
1561///    elements from the first 256-bit operand are copied to bits [63:0] and
1562///    bits [191:128] in the destination, and the selected elements from the
1563///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
1564///    the destination. For example, if bits [3:0] of the immediate operand
1565///    contain a value of 0xF, the 256-bit destination vector would contain the
1566///    following values: b[3], a[3], b[1], a[1].
1567///
1568/// \headerfile <x86intrin.h>
1569///
1570/// \code
1571/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
1572/// \endcode
1573///
1574/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
1575///
1576/// \param a
1577///    A 256-bit vector of [4 x double].
1578/// \param b
1579///    A 256-bit vector of [4 x double].
1580/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
1583///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
1584///    destination. \n
1585///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
1586///    destination. \n
1587///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
1588///    destination. \n
1589///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
1590///    destination. \n
1591///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
1592///    destination. \n
1593///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
1594///    destination. \n
1595///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
1596///    destination. \n
1597///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
1598///    destination.
1599/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(a), (__v4df)(__m256d)(b), \
      /* Shuffle indices 0-3 address elements of a, 4-7 elements of b. */ \
      /* One mask bit per destination element, lowest element first. */ \
      (((mask) >> 0) & 0x1) + 0, \
      (((mask) >> 1) & 0x1) + 4, \
      (((mask) >> 2) & 0x1) + 2, \
      (((mask) >> 3) & 0x1) + 6); })
1607
1608/* Compare */
/*
 * Predicate encodings for the immediate operand of the _mm_cmp_* and
 * _mm256_cmp_* macros.
 *
 * Suffix letters: O = ordered, U = unordered, S = signaling,
 * Q = non-signaling. Each value in 0x10-0x1f names the same predicate as the
 * value 0x10 below it, with the opposite signaling behavior.
 */

/* 0x00 - 0x07 */
#define _CMP_EQ_OQ    0x00 /* Equal; ordered, non-signaling */
#define _CMP_LT_OS    0x01 /* Less-than; ordered, signaling */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal; ordered, signaling */
#define _CMP_UNORD_Q  0x03 /* Unordered; non-signaling */
#define _CMP_NEQ_UQ   0x04 /* Not-equal; unordered, non-signaling */
#define _CMP_NLT_US   0x05 /* Not-less-than; unordered, signaling */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal; unordered, signaling */
#define _CMP_ORD_Q    0x07 /* Ordered; non-signaling */

/* 0x08 - 0x0f */
#define _CMP_EQ_UQ    0x08 /* Equal; unordered, non-signaling */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal; unordered, signaling */
#define _CMP_NGT_US   0x0a /* Not-greater-than; unordered, signaling */
#define _CMP_FALSE_OQ 0x0b /* False; ordered, non-signaling */
#define _CMP_NEQ_OQ   0x0c /* Not-equal; ordered, non-signaling */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal; ordered, signaling */
#define _CMP_GT_OS    0x0e /* Greater-than; ordered, signaling */
#define _CMP_TRUE_UQ  0x0f /* True; unordered, non-signaling */

/* 0x10 - 0x17 */
#define _CMP_EQ_OS    0x10 /* Equal; ordered, signaling */
#define _CMP_LT_OQ    0x11 /* Less-than; ordered, non-signaling */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal; ordered, non-signaling */
#define _CMP_UNORD_S  0x13 /* Unordered; signaling */
#define _CMP_NEQ_US   0x14 /* Not-equal; unordered, signaling */
#define _CMP_NLT_UQ   0x15 /* Not-less-than; unordered, non-signaling */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal; unordered, non-signaling */
#define _CMP_ORD_S    0x17 /* Ordered; signaling */

/* 0x18 - 0x1f */
#define _CMP_EQ_US    0x18 /* Equal; unordered, signaling */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal; unordered, non-signaling */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than; unordered, non-signaling */
#define _CMP_FALSE_OS 0x1b /* False; ordered, signaling */
#define _CMP_NEQ_OS   0x1c /* Not-equal; ordered, signaling */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal; ordered, non-signaling */
#define _CMP_GT_OQ    0x1e /* Greater-than; ordered, non-signaling */
#define _CMP_TRUE_US  0x1f /* True; unordered, signaling */
1641
1642/// \brief Compares each of the corresponding double-precision values of two
1643///    128-bit vectors of [2 x double], using the operation specified by the
1644///    immediate integer operand. Returns a [2 x double] vector consisting of
1645///    two doubles corresponding to the two comparison results: zero if the
1646///    comparison is false, and all 1's if the comparison is true.
1647///
1648/// \headerfile <x86intrin.h>
1649///
1650/// \code
1651/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
1652/// \endcode
1653///
1654/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1655///
1656/// \param a
1657///    A 128-bit vector of [2 x double].
1658/// \param b
1659///    A 128-bit vector of [2 x double].
1660/// \param c
1661///    An immediate integer operand, with bits [4:0] specifying which comparison
1662///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
1694///    0x1f : True (unordered, signaling)
1695/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  /* c is the comparison predicate; it is passed straight through. */ \
  (__m128d)__builtin_ia32_cmppd( \
      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (c)); })
1699
1700/// \brief Compares each of the corresponding values of two 128-bit vectors of
1701///    [4 x float], using the operation specified by the immediate integer
1702///    operand. Returns a [4 x float] vector consisting of four floats
1703///    corresponding to the four comparison results: zero if the comparison is
1704///    false, and all 1's if the comparison is true.
1705///
1706/// \headerfile <x86intrin.h>
1707///
1708/// \code
1709/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
1710/// \endcode
1711///
1712/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1713///
1714/// \param a
1715///    A 128-bit vector of [4 x float].
1716/// \param b
1717///    A 128-bit vector of [4 x float].
1718/// \param c
1719///    An immediate integer operand, with bits [4:0] specifying which comparison
1720///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
1752///    0x1f : True (unordered, signaling)
1753/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  /* c is the comparison predicate; it is passed straight through. */ \
  (__m128)__builtin_ia32_cmpps( \
      (__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)); })
1757
1758/// \brief Compares each of the corresponding double-precision values of two
1759///    256-bit vectors of [4 x double], using the operation specified by the
1760///    immediate integer operand. Returns a [4 x double] vector consisting of
1761///    four doubles corresponding to the four comparison results: zero if the
1762///    comparison is false, and all 1's if the comparison is true.
1763///
1764/// \headerfile <x86intrin.h>
1765///
1766/// \code
1767/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
1768/// \endcode
1769///
1770/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1771///
1772/// \param a
1773///    A 256-bit vector of [4 x double].
1774/// \param b
1775///    A 256-bit vector of [4 x double].
1776/// \param c
1777///    An immediate integer operand, with bits [4:0] specifying which comparison
1778///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
1810///    0x1f : True (unordered, signaling)
1811/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  /* c is the comparison predicate; it is passed straight through. */ \
  (__m256d)__builtin_ia32_cmppd256( \
      (__v4df)(__m256d)(a), (__v4df)(__m256d)(b), (c)); })
1815
1816/// \brief Compares each of the corresponding values of two 256-bit vectors of
1817///    [8 x float], using the operation specified by the immediate integer
1818///    operand. Returns a [8 x float] vector consisting of eight floats
1819///    corresponding to the eight comparison results: zero if the comparison is
1820///    false, and all 1's if the comparison is true.
1821///
1822/// \headerfile <x86intrin.h>
1823///
1824/// \code
1825/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
1826/// \endcode
1827///
1828/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1829///
1830/// \param a
1831///    A 256-bit vector of [8 x float].
1832/// \param b
1833///    A 256-bit vector of [8 x float].
1834/// \param c
1835///    An immediate integer operand, with bits [4:0] specifying which comparison
1836///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
1868///    0x1f : True (unordered, signaling)
1869/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  /* c is the comparison predicate; it is passed straight through. */ \
  (__m256)__builtin_ia32_cmpps256( \
      (__v8sf)(__m256)(a), (__v8sf)(__m256)(b), (c)); })
1873
/// \brief Compares the scalar double-precision values in the low-order 64
///    bits of two 128-bit vectors of [2 x double], using the operation
///    specified by the immediate integer operand. If the result is true, all
///    of the low-order 64 bits of the destination vector are set; otherwise
///    they are cleared.
1878///
1879/// \headerfile <x86intrin.h>
1880///
1881/// \code
1882/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
1883/// \endcode
1884///
1885/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
1886///
1887/// \param a
1888///    A 128-bit vector of [2 x double].
1889/// \param b
1890///    A 128-bit vector of [2 x double].
1891/// \param c
1892///    An immediate integer operand, with bits [4:0] specifying which comparison
1893///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
1925///    0x1f : True (unordered, signaling)
1926/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  /* c is the comparison predicate; it is passed straight through. */ \
  (__m128d)__builtin_ia32_cmpsd( \
      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (c)); })
1930
/// \brief Compares the scalar single-precision values in the low-order 32
///    bits of two 128-bit vectors of [4 x float], using the operation
///    specified by the immediate integer operand. If the result is true, all
///    of the low-order 32 bits of the destination vector are set; otherwise
///    they are cleared.
1935///
1936/// \headerfile <x86intrin.h>
1937///
1938/// \code
1939/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
1940/// \endcode
1941///
1942/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
1943///
1944/// \param a
1945///    A 128-bit vector of [4 x float].
1946/// \param b
1947///    A 128-bit vector of [4 x float].
1948/// \param c
1949///    An immediate integer operand, with bits [4:0] specifying which comparison
1950///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
1982///    0x1f : True (unordered, signaling)
1983/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  /* c is the comparison predicate; it is passed straight through. */ \
  (__m128)__builtin_ia32_cmpss( \
      (__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)); })
1987
1988/// \brief Takes a [8 x i32] vector and returns the vector element value
1989///    indexed by the immediate constant operand.
1990///
1991/// \headerfile <x86intrin.h>
1992///
1993/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
1994///   instruction.
1995///
1996/// \param __a
1997///    A 256-bit vector of [8 x i32].
1998/// \param __imm
1999///    An immediate integer operand with bits [2:0] determining which vector
2000///    element is extracted and returned.
2001/// \returns A 32-bit integer containing the extracted 32 bits of extended
2002///    packed data.
2003static __inline int __DEFAULT_FN_ATTRS
2004_mm256_extract_epi32(__m256i __a, const int __imm)
2005{
2006  __v8si __b = (__v8si)__a;
2007  return __b[__imm & 7];
2008}
2009
2010/// \brief Takes a [16 x i16] vector and returns the vector element value
2011///    indexed by the immediate constant operand.
2012///
2013/// \headerfile <x86intrin.h>
2014///
2015/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2016///   instruction.
2017///
2018/// \param __a
2019///    A 256-bit integer vector of [16 x i16].
2020/// \param __imm
2021///    An immediate integer operand with bits [3:0] determining which vector
2022///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero-extended
///    packed data.
2025static __inline int __DEFAULT_FN_ATTRS
2026_mm256_extract_epi16(__m256i __a, const int __imm)
2027{
2028  __v16hi __b = (__v16hi)__a;
2029  return (unsigned short)__b[__imm & 15];
2030}
2031
2032/// \brief Takes a [32 x i8] vector and returns the vector element value
2033///    indexed by the immediate constant operand.
2034///
2035/// \headerfile <x86intrin.h>
2036///
2037/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2038///   instruction.
2039///
2040/// \param __a
2041///    A 256-bit integer vector of [32 x i8].
2042/// \param __imm
2043///    An immediate integer operand with bits [4:0] determining which vector
2044///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero-extended
///    packed data.
2047static __inline int __DEFAULT_FN_ATTRS
2048_mm256_extract_epi8(__m256i __a, const int __imm)
2049{
2050  __v32qi __b = (__v32qi)__a;
2051  return (unsigned char)__b[__imm & 31];
2052}
2053
2054#ifdef __x86_64__
2055/// \brief Takes a [4 x i64] vector and returns the vector element value
2056///    indexed by the immediate constant operand.
2057///
2058/// \headerfile <x86intrin.h>
2059///
2060/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2061///   instruction.
2062///
2063/// \param __a
2064///    A 256-bit integer vector of [4 x i64].
2065/// \param __imm
2066///    An immediate integer operand with bits [1:0] determining which vector
2067///    element is extracted and returned.
2068/// \returns A 64-bit integer containing the extracted 64 bits of extended
2069///    packed data.
2070static __inline long long  __DEFAULT_FN_ATTRS
2071_mm256_extract_epi64(__m256i __a, const int __imm)
2072{
2073  __v4di __b = (__v4di)__a;
2074  return __b[__imm & 3];
2075}
2076#endif
2077
2078/// \brief Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
2080///    modified vector.
2081///
2082/// \headerfile <x86intrin.h>
2083///
2084/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2085///   instruction.
2086///
2087/// \param __a
2088///    A vector of [8 x i32] to be used by the insert operation.
2089/// \param __b
2090///    An integer value. The replacement value for the insert operation.
2091/// \param __imm
2092///    An immediate integer specifying the index of the vector element to be
2093///    replaced.
2094/// \returns A copy of vector \a __a, after replacing its element indexed by
2095///    \a __imm with \a __b.
2096static __inline __m256i __DEFAULT_FN_ATTRS
2097_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
2098{
2099  __v8si __c = (__v8si)__a;
2100  __c[__imm & 7] = __b;
2101  return (__m256i)__c;
2102}
2103
2104
2105/// \brief Takes a [16 x i16] vector and replaces the vector element value
2106///    indexed by the immediate constant operand with a new value. Returns the
2107///    modified vector.
2108///
2109/// \headerfile <x86intrin.h>
2110///
2111/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2112///   instruction.
2113///
2114/// \param __a
2115///    A vector of [16 x i16] to be used by the insert operation.
2116/// \param __b
2117///    An i16 integer value. The replacement value for the insert operation.
2118/// \param __imm
2119///    An immediate integer specifying the index of the vector element to be
2120///    replaced.
2121/// \returns A copy of vector \a __a, after replacing its element indexed by
2122///    \a __imm with \a __b.
2123static __inline __m256i __DEFAULT_FN_ATTRS
2124_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
2125{
2126  __v16hi __c = (__v16hi)__a;
2127  __c[__imm & 15] = __b;
2128  return (__m256i)__c;
2129}
2130
2131/// \brief Takes a [32 x i8] vector and replaces the vector element value
2132///    indexed by the immediate constant operand with a new value. Returns the
2133///    modified vector.
2134///
2135/// \headerfile <x86intrin.h>
2136///
2137/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2138///   instruction.
2139///
2140/// \param __a
2141///    A vector of [32 x i8] to be used by the insert operation.
2142/// \param __b
2143///    An i8 integer value. The replacement value for the insert operation.
2144/// \param __imm
2145///    An immediate integer specifying the index of the vector element to be
2146///    replaced.
2147/// \returns A copy of vector \a __a, after replacing its element indexed by
2148///    \a __imm with \a __b.
2149static __inline __m256i __DEFAULT_FN_ATTRS
2150_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
2151{
2152  __v32qi __c = (__v32qi)__a;
2153  __c[__imm & 31] = __b;
2154  return (__m256i)__c;
2155}
2156
2157#ifdef __x86_64__
2158/// \brief Takes a [4 x i64] vector and replaces the vector element value
2159///    indexed by the immediate constant operand with a new value. Returns the
2160///    modified vector.
2161///
2162/// \headerfile <x86intrin.h>
2163///
2164/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2165///   instruction.
2166///
2167/// \param __a
2168///    A vector of [4 x i64] to be used by the insert operation.
2169/// \param __b
2170///    A 64-bit integer value. The replacement value for the insert operation.
2171/// \param __imm
2172///    An immediate integer specifying the index of the vector element to be
2173///    replaced.
2174/// \returns A copy of vector \a __a, after replacing its element indexed by
2175///     \a __imm with \a __b.
2176static __inline __m256i __DEFAULT_FN_ATTRS
2177_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
2178{
2179  __v4di __c = (__v4di)__a;
2180  __c[__imm & 3] = __b;
2181  return (__m256i)__c;
2182}
2183#endif
2184
2185/* Conversion */
2186/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
2187///
2188/// \headerfile <x86intrin.h>
2189///
2190/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2191///
2192/// \param __a
2193///    A 128-bit integer vector of [4 x i32].
2194/// \returns A 256-bit vector of [4 x double] containing the converted values.
2195static __inline __m256d __DEFAULT_FN_ATTRS
2196_mm256_cvtepi32_pd(__m128i __a)
2197{
2198  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2199}
2200
2201/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
2202///
2203/// \headerfile <x86intrin.h>
2204///
2205/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2206///
2207/// \param __a
2208///    A 256-bit integer vector.
2209/// \returns A 256-bit vector of [8 x float] containing the converted values.
2210static __inline __m256 __DEFAULT_FN_ATTRS
2211_mm256_cvtepi32_ps(__m256i __a)
2212{
2213  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
2214}
2215
2216/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2217///    [4 x float].
2218///
2219/// \headerfile <x86intrin.h>
2220///
2221/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2222///
2223/// \param __a
2224///    A 256-bit vector of [4 x double].
2225/// \returns A 128-bit vector of [4 x float] containing the converted values.
2226static __inline __m128 __DEFAULT_FN_ATTRS
2227_mm256_cvtpd_ps(__m256d __a)
2228{
2229  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2230}
2231
2232/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
2233///
2234/// \headerfile <x86intrin.h>
2235///
2236/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2237///
2238/// \param __a
2239///    A 256-bit vector of [8 x float].
2240/// \returns A 256-bit integer vector containing the converted values.
2241static __inline __m256i __DEFAULT_FN_ATTRS
2242_mm256_cvtps_epi32(__m256 __a)
2243{
2244  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2245}
2246
2247/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2248///    x double].
2249///
2250/// \headerfile <x86intrin.h>
2251///
2252/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2253///
2254/// \param __a
2255///    A 128-bit vector of [4 x float].
2256/// \returns A 256-bit vector of [4 x double] containing the converted values.
2257static __inline __m256d __DEFAULT_FN_ATTRS
2258_mm256_cvtps_pd(__m128 __a)
2259{
2260  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2261}
2262
2263/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2264///    x i32], truncating the result by rounding towards zero when it is
2265///    inexact.
2266///
2267/// \headerfile <x86intrin.h>
2268///
2269/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2270///
2271/// \param __a
2272///    A 256-bit vector of [4 x double].
2273/// \returns A 128-bit integer vector containing the converted values.
2274static __inline __m128i __DEFAULT_FN_ATTRS
2275_mm256_cvttpd_epi32(__m256d __a)
2276{
2277  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2278}
2279
2280/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2281///    x i32]. When a conversion is inexact, the value returned is rounded
2282///    according to the rounding control bits in the MXCSR register.
2283///
2284/// \headerfile <x86intrin.h>
2285///
2286/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2287///
2288/// \param __a
2289///    A 256-bit vector of [4 x double].
2290/// \returns A 128-bit integer vector containing the converted values.
2291static __inline __m128i __DEFAULT_FN_ATTRS
2292_mm256_cvtpd_epi32(__m256d __a)
2293{
2294  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2295}
2296
2297/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
2298///    truncating the result by rounding towards zero when it is inexact.
2299///
2300/// \headerfile <x86intrin.h>
2301///
2302/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2303///
2304/// \param __a
2305///    A 256-bit vector of [8 x float].
2306/// \returns A 256-bit integer vector containing the converted values.
2307static __inline __m256i __DEFAULT_FN_ATTRS
2308_mm256_cvttps_epi32(__m256 __a)
2309{
2310  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2311}
2312
2313/// \brief Returns the first element of the input vector of [4 x double].
2314///
2315/// \headerfile <avxintrin.h>
2316///
2317/// This intrinsic is a utility function and does not correspond to a specific
2318///    instruction.
2319///
2320/// \param __a
2321///    A 256-bit vector of [4 x double].
2322/// \returns A 64 bit double containing the first element of the input vector.
2323static __inline double __DEFAULT_FN_ATTRS
2324_mm256_cvtsd_f64(__m256d __a)
2325{
2326 return __a[0];
2327}
2328
2329/// \brief Returns the first element of the input vector of [8 x i32].
2330///
2331/// \headerfile <avxintrin.h>
2332///
2333/// This intrinsic is a utility function and does not correspond to a specific
2334///    instruction.
2335///
2336/// \param __a
2337///    A 256-bit vector of [8 x i32].
2338/// \returns A 32 bit integer containing the first element of the input vector.
2339static __inline int __DEFAULT_FN_ATTRS
2340_mm256_cvtsi256_si32(__m256i __a)
2341{
2342 __v8si __b = (__v8si)__a;
2343 return __b[0];
2344}
2345
2346/// \brief Returns the first element of the input vector of [8 x float].
2347///
2348/// \headerfile <avxintrin.h>
2349///
2350/// This intrinsic is a utility function and does not correspond to a specific
2351///    instruction.
2352///
2353/// \param __a
2354///    A 256-bit vector of [8 x float].
2355/// \returns A 32 bit float containing the first element of the input vector.
2356static __inline float __DEFAULT_FN_ATTRS
2357_mm256_cvtss_f32(__m256 __a)
2358{
2359 return __a[0];
2360}
2361
2362/* Vector replicate */
2363/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
2364///    vector of [8 x float] to float values in a 256-bit vector of
2365///    [8 x float].
2366///
2367/// \headerfile <x86intrin.h>
2368///
2369/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2370///
2371/// \param __a
2372///    A 256-bit vector of [8 x float]. \n
2373///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2374///    the return value. \n
2375///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2376///    the return value. \n
2377///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2378///    return value. \n
2379///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2380///    return value.
2381/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2382///    values.
2383static __inline __m256 __DEFAULT_FN_ATTRS
2384_mm256_movehdup_ps(__m256 __a)
2385{
2386  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2387}
2388
2389/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
2390///    vector of [8 x float] to float values in a 256-bit vector of [8 x float].
2391///
2392/// \headerfile <x86intrin.h>
2393///
2394/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2395///
2396/// \param __a
2397///    A 256-bit vector of [8 x float]. \n
2398///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2399///    the return value. \n
2400///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2401///    the return value. \n
2402///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2403///    return value. \n
2404///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2405///    return value.
2406/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2407///    values.
2408static __inline __m256 __DEFAULT_FN_ATTRS
2409_mm256_moveldup_ps(__m256 __a)
2410{
2411  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2412}
2413
2414/// \brief Moves and duplicates double-precision floating point values from a
2415///    256-bit vector of [4 x double] to double-precision values in a 256-bit
2416///    vector of [4 x double].
2417///
2418/// \headerfile <x86intrin.h>
2419///
2420/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2421///
2422/// \param __a
2423///    A 256-bit vector of [4 x double]. \n
2424///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2425///    return value. \n
2426///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2427///    the return value.
2428/// \returns A 256-bit vector of [4 x double] containing the moved and
2429///    duplicated values.
2430static __inline __m256d __DEFAULT_FN_ATTRS
2431_mm256_movedup_pd(__m256d __a)
2432{
2433  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2434}
2435
2436/* Unpack and Interleave */
2437/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
2438///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2439///
2440/// \headerfile <x86intrin.h>
2441///
2442/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2443///
2444/// \param __a
2445///    A 256-bit floating-point vector of [4 x double]. \n
2446///    Bits [127:64] are written to bits [63:0] of the return value. \n
2447///    Bits [255:192] are written to bits [191:128] of the return value. \n
2448/// \param __b
2449///    A 256-bit floating-point vector of [4 x double]. \n
2450///    Bits [127:64] are written to bits [127:64] of the return value. \n
2451///    Bits [255:192] are written to bits [255:192] of the return value. \n
2452/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2453static __inline __m256d __DEFAULT_FN_ATTRS
2454_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2455{
2456  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2457}
2458
2459/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
2460///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2461///
2462/// \headerfile <x86intrin.h>
2463///
2464/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2465///
2466/// \param __a
2467///    A 256-bit floating-point vector of [4 x double]. \n
2468///    Bits [63:0] are written to bits [63:0] of the return value. \n
2469///    Bits [191:128] are written to bits [191:128] of the return value.
2470/// \param __b
2471///    A 256-bit floating-point vector of [4 x double]. \n
2472///    Bits [63:0] are written to bits [127:64] of the return value. \n
2473///    Bits [191:128] are written to bits [255:192] of the return value. \n
2474/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2475static __inline __m256d __DEFAULT_FN_ATTRS
2476_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2477{
2478  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2479}
2480
2481/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2482///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2483///    vector of [8 x float].
2484///
2485/// \headerfile <x86intrin.h>
2486///
2487/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2488///
2489/// \param __a
2490///    A 256-bit vector of [8 x float]. \n
2491///    Bits [95:64] are written to bits [31:0] of the return value. \n
2492///    Bits [127:96] are written to bits [95:64] of the return value. \n
2493///    Bits [223:192] are written to bits [159:128] of the return value. \n
2494///    Bits [255:224] are written to bits [223:192] of the return value.
2495/// \param __b
2496///    A 256-bit vector of [8 x float]. \n
2497///    Bits [95:64] are written to bits [63:32] of the return value. \n
2498///    Bits [127:96] are written to bits [127:96] of the return value. \n
2499///    Bits [223:192] are written to bits [191:160] of the return value. \n
2500///    Bits [255:224] are written to bits [255:224] of the return value.
2501/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2502static __inline __m256 __DEFAULT_FN_ATTRS
2503_mm256_unpackhi_ps(__m256 __a, __m256 __b)
2504{
2505  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2506}
2507
2508/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2509///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2510///    vector of [8 x float].
2511///
2512/// \headerfile <x86intrin.h>
2513///
2514/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2515///
2516/// \param __a
2517///    A 256-bit vector of [8 x float]. \n
2518///    Bits [31:0] are written to bits [31:0] of the return value. \n
2519///    Bits [63:32] are written to bits [95:64] of the return value. \n
2520///    Bits [159:128] are written to bits [159:128] of the return value. \n
2521///    Bits [191:160] are written to bits [223:192] of the return value.
2522/// \param __b
2523///    A 256-bit vector of [8 x float]. \n
2524///    Bits [31:0] are written to bits [63:32] of the return value. \n
2525///    Bits [63:32] are written to bits [127:96] of the return value. \n
2526///    Bits [159:128] are written to bits [191:160] of the return value. \n
2527///    Bits [191:160] are written to bits [255:224] of the return value.
2528/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2529static __inline __m256 __DEFAULT_FN_ATTRS
2530_mm256_unpacklo_ps(__m256 __a, __m256 __b)
2531{
2532  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2533}
2534
2535/* Bit Test */
2536/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2537///    element-by-element comparison of the double-precision element in the
2538///    first source vector and the corresponding element in the second source
2539///    vector. The EFLAGS register is updated as follows: \n
2540///    If there is at least one pair of double-precision elements where the
2541///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2542///    ZF flag is set to 1. \n
2543///    If there is at least one pair of double-precision elements where the
2544///    sign-bit of the first element is 0 and the sign-bit of the second element
2545///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2546///    This intrinsic returns the value of the ZF flag.
2547///
2548/// \headerfile <x86intrin.h>
2549///
2550/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2551///
2552/// \param __a
2553///    A 128-bit vector of [2 x double].
2554/// \param __b
2555///    A 128-bit vector of [2 x double].
2556/// \returns the ZF flag in the EFLAGS register.
2557static __inline int __DEFAULT_FN_ATTRS
2558_mm_testz_pd(__m128d __a, __m128d __b)
2559{
2560  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2561}
2562
2563/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2564///    element-by-element comparison of the double-precision element in the
2565///    first source vector and the corresponding element in the second source
2566///    vector. The EFLAGS register is updated as follows: \n
2567///    If there is at least one pair of double-precision elements where the
2568///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2569///    ZF flag is set to 1. \n
2570///    If there is at least one pair of double-precision elements where the
2571///    sign-bit of the first element is 0 and the sign-bit of the second element
2572///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2573///    This intrinsic returns the value of the CF flag.
2574///
2575/// \headerfile <x86intrin.h>
2576///
2577/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2578///
2579/// \param __a
2580///    A 128-bit vector of [2 x double].
2581/// \param __b
2582///    A 128-bit vector of [2 x double].
2583/// \returns the CF flag in the EFLAGS register.
2584static __inline int __DEFAULT_FN_ATTRS
2585_mm_testc_pd(__m128d __a, __m128d __b)
2586{
2587  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2588}
2589
2590/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2591///    element-by-element comparison of the double-precision element in the
2592///    first source vector and the corresponding element in the second source
2593///    vector. The EFLAGS register is updated as follows: \n
2594///    If there is at least one pair of double-precision elements where the
2595///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2596///    ZF flag is set to 1. \n
2597///    If there is at least one pair of double-precision elements where the
2598///    sign-bit of the first element is 0 and the sign-bit of the second element
2599///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2600///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2601///    otherwise it returns 0.
2602///
2603/// \headerfile <x86intrin.h>
2604///
2605/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2606///
2607/// \param __a
2608///    A 128-bit vector of [2 x double].
2609/// \param __b
2610///    A 128-bit vector of [2 x double].
2611/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2612static __inline int __DEFAULT_FN_ATTRS
2613_mm_testnzc_pd(__m128d __a, __m128d __b)
2614{
2615  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2616}
2617
2618/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2619///    element-by-element comparison of the single-precision element in the
2620///    first source vector and the corresponding element in the second source
2621///    vector. The EFLAGS register is updated as follows: \n
2622///    If there is at least one pair of single-precision elements where the
2623///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2624///    ZF flag is set to 1. \n
2625///    If there is at least one pair of single-precision elements where the
2626///    sign-bit of the first element is 0 and the sign-bit of the second element
2627///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2628///    This intrinsic returns the value of the ZF flag.
2629///
2630/// \headerfile <x86intrin.h>
2631///
2632/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2633///
2634/// \param __a
2635///    A 128-bit vector of [4 x float].
2636/// \param __b
2637///    A 128-bit vector of [4 x float].
2638/// \returns the ZF flag.
2639static __inline int __DEFAULT_FN_ATTRS
2640_mm_testz_ps(__m128 __a, __m128 __b)
2641{
2642  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2643}
2644
2645/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2646///    element-by-element comparison of the single-precision element in the
2647///    first source vector and the corresponding element in the second source
2648///    vector. The EFLAGS register is updated as follows: \n
2649///    If there is at least one pair of single-precision elements where the
2650///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2651///    ZF flag is set to 1. \n
2652///    If there is at least one pair of single-precision elements where the
2653///    sign-bit of the first element is 0 and the sign-bit of the second element
2654///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2655///    This intrinsic returns the value of the CF flag.
2656///
2657/// \headerfile <x86intrin.h>
2658///
2659/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2660///
2661/// \param __a
2662///    A 128-bit vector of [4 x float].
2663/// \param __b
2664///    A 128-bit vector of [4 x float].
2665/// \returns the CF flag.
2666static __inline int __DEFAULT_FN_ATTRS
2667_mm_testc_ps(__m128 __a, __m128 __b)
2668{
2669  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2670}
2671
2672/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2673///    element-by-element comparison of the single-precision element in the
2674///    first source vector and the corresponding element in the second source
2675///    vector. The EFLAGS register is updated as follows: \n
2676///    If there is at least one pair of single-precision elements where the
2677///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2678///    ZF flag is set to 1. \n
2679///    If there is at least one pair of single-precision elements where the
2680///    sign-bit of the first element is 0 and the sign-bit of the second element
2681///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2682///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2683///    otherwise it returns 0.
2684///
2685/// \headerfile <x86intrin.h>
2686///
2687/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2688///
2689/// \param __a
2690///    A 128-bit vector of [4 x float].
2691/// \param __b
2692///    A 128-bit vector of [4 x float].
2693/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2694static __inline int __DEFAULT_FN_ATTRS
2695_mm_testnzc_ps(__m128 __a, __m128 __b)
2696{
2697  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2698}
2699
2700/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2701///    element-by-element comparison of the double-precision elements in the
2702///    first source vector and the corresponding elements in the second source
2703///    vector. The EFLAGS register is updated as follows: \n
2704///    If there is at least one pair of double-precision elements where the
2705///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2706///    ZF flag is set to 1. \n
2707///    If there is at least one pair of double-precision elements where the
2708///    sign-bit of the first element is 0 and the sign-bit of the second element
2709///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2710///    This intrinsic returns the value of the ZF flag.
2711///
2712/// \headerfile <x86intrin.h>
2713///
2714/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2715///
2716/// \param __a
2717///    A 256-bit vector of [4 x double].
2718/// \param __b
2719///    A 256-bit vector of [4 x double].
2720/// \returns the ZF flag.
2721static __inline int __DEFAULT_FN_ATTRS
2722_mm256_testz_pd(__m256d __a, __m256d __b)
2723{
2724  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2725}
2726
2727/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2728///    element-by-element comparison of the double-precision elements in the
2729///    first source vector and the corresponding elements in the second source
2730///    vector. The EFLAGS register is updated as follows: \n
2731///    If there is at least one pair of double-precision elements where the
2732///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2733///    ZF flag is set to 1. \n
2734///    If there is at least one pair of double-precision elements where the
2735///    sign-bit of the first element is 0 and the sign-bit of the second element
2736///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2737///    This intrinsic returns the value of the CF flag.
2738///
2739/// \headerfile <x86intrin.h>
2740///
2741/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2742///
2743/// \param __a
2744///    A 256-bit vector of [4 x double].
2745/// \param __b
2746///    A 256-bit vector of [4 x double].
2747/// \returns the CF flag.
2748static __inline int __DEFAULT_FN_ATTRS
2749_mm256_testc_pd(__m256d __a, __m256d __b)
2750{
2751  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2752}
2753
2754/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2755///    element-by-element comparison of the double-precision elements in the
2756///    first source vector and the corresponding elements in the second source
2757///    vector. The EFLAGS register is updated as follows: \n
2758///    If there is at least one pair of double-precision elements where the
2759///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2760///    ZF flag is set to 1. \n
2761///    If there is at least one pair of double-precision elements where the
2762///    sign-bit of the first element is 0 and the sign-bit of the second element
2763///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2764///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2765///    otherwise it returns 0.
2766///
2767/// \headerfile <x86intrin.h>
2768///
2769/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2770///
2771/// \param __a
2772///    A 256-bit vector of [4 x double].
2773/// \param __b
2774///    A 256-bit vector of [4 x double].
2775/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2776static __inline int __DEFAULT_FN_ATTRS
2777_mm256_testnzc_pd(__m256d __a, __m256d __b)
2778{
2779  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2780}
2781
2782/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2783///    element-by-element comparison of the single-precision element in the
2784///    first source vector and the corresponding element in the second source
2785///    vector. The EFLAGS register is updated as follows: \n
2786///    If there is at least one pair of single-precision elements where the
2787///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2788///    ZF flag is set to 1. \n
2789///    If there is at least one pair of single-precision elements where the
2790///    sign-bit of the first element is 0 and the sign-bit of the second element
2791///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2792///    This intrinsic returns the value of the ZF flag.
2793///
2794/// \headerfile <x86intrin.h>
2795///
2796/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2797///
2798/// \param __a
2799///    A 256-bit vector of [8 x float].
2800/// \param __b
2801///    A 256-bit vector of [8 x float].
2802/// \returns the ZF flag.
2803static __inline int __DEFAULT_FN_ATTRS
2804_mm256_testz_ps(__m256 __a, __m256 __b)
2805{
2806  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2807}
2808
2809/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2810///    element-by-element comparison of the single-precision element in the
2811///    first source vector and the corresponding element in the second source
2812///    vector. The EFLAGS register is updated as follows: \n
2813///    If there is at least one pair of single-precision elements where the
2814///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2815///    ZF flag is set to 1. \n
2816///    If there is at least one pair of single-precision elements where the
2817///    sign-bit of the first element is 0 and the sign-bit of the second element
2818///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2819///    This intrinsic returns the value of the CF flag.
2820///
2821/// \headerfile <x86intrin.h>
2822///
2823/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2824///
2825/// \param __a
2826///    A 256-bit vector of [8 x float].
2827/// \param __b
2828///    A 256-bit vector of [8 x float].
2829/// \returns the CF flag.
2830static __inline int __DEFAULT_FN_ATTRS
2831_mm256_testc_ps(__m256 __a, __m256 __b)
2832{
2833  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2834}
2835
2836/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2837///    element-by-element comparison of the single-precision elements in the
2838///    first source vector and the corresponding elements in the second source
2839///    vector. The EFLAGS register is updated as follows: \n
2840///    If there is at least one pair of single-precision elements where the
2841///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2842///    ZF flag is set to 1. \n
2843///    If there is at least one pair of single-precision elements where the
2844///    sign-bit of the first element is 0 and the sign-bit of the second element
2845///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2846///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2847///    otherwise it returns 0.
2848///
2849/// \headerfile <x86intrin.h>
2850///
2851/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2852///
2853/// \param __a
2854///    A 256-bit vector of [8 x float].
2855/// \param __b
2856///    A 256-bit vector of [8 x float].
2857/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2858static __inline int __DEFAULT_FN_ATTRS
2859_mm256_testnzc_ps(__m256 __a, __m256 __b)
2860{
2861  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2862}
2863
2864/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2865///    of the two source vectors and update the EFLAGS register as follows: \n
2866///    If there is at least one pair of bits where both bits are 1, the ZF flag
2867///    is set to 0. Otherwise the ZF flag is set to 1. \n
2868///    If there is at least one pair of bits where the bit from the first source
2869///    vector is 0 and the bit from the second source vector is 1, the CF flag
2870///    is set to 0. Otherwise the CF flag is set to 1. \n
2871///    This intrinsic returns the value of the ZF flag.
2872///
2873/// \headerfile <x86intrin.h>
2874///
2875/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2876///
2877/// \param __a
2878///    A 256-bit integer vector.
2879/// \param __b
2880///    A 256-bit integer vector.
2881/// \returns the ZF flag.
2882static __inline int __DEFAULT_FN_ATTRS
2883_mm256_testz_si256(__m256i __a, __m256i __b)
2884{
2885  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2886}
2887
2888/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2889///    of the two source vectors and update the EFLAGS register as follows: \n
2890///    If there is at least one pair of bits where both bits are 1, the ZF flag
2891///    is set to 0. Otherwise the ZF flag is set to 1. \n
2892///    If there is at least one pair of bits where the bit from the first source
2893///    vector is 0 and the bit from the second source vector is 1, the CF flag
2894///    is set to 0. Otherwise the CF flag is set to 1. \n
2895///    This intrinsic returns the value of the CF flag.
2896///
2897/// \headerfile <x86intrin.h>
2898///
2899/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2900///
2901/// \param __a
2902///    A 256-bit integer vector.
2903/// \param __b
2904///    A 256-bit integer vector.
2905/// \returns the CF flag.
2906static __inline int __DEFAULT_FN_ATTRS
2907_mm256_testc_si256(__m256i __a, __m256i __b)
2908{
2909  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2910}
2911
2912/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2913///    of the two source vectors and update the EFLAGS register as follows: \n
2914///    If there is at least one pair of bits where both bits are 1, the ZF flag
2915///    is set to 0. Otherwise the ZF flag is set to 1. \n
2916///    If there is at least one pair of bits where the bit from the first source
2917///    vector is 0 and the bit from the second source vector is 1, the CF flag
2918///    is set to 0. Otherwise the CF flag is set to 1. \n
2919///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2920///    otherwise it returns 0.
2921///
2922/// \headerfile <x86intrin.h>
2923///
2924/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2925///
2926/// \param __a
2927///    A 256-bit integer vector.
2928/// \param __b
2929///    A 256-bit integer vector.
2930/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2931static __inline int __DEFAULT_FN_ATTRS
2932_mm256_testnzc_si256(__m256i __a, __m256i __b)
2933{
2934  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2935}
2936
2937/* Vector extract sign mask */
2938/// \brief Extracts the sign bits of double-precision floating point elements
2939///    in a 256-bit vector of [4 x double] and writes them to the lower order
2940///    bits of the return value.
2941///
2942/// \headerfile <x86intrin.h>
2943///
2944/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2945///
2946/// \param __a
2947///    A 256-bit vector of [4 x double] containing the double-precision
2948///    floating point values with sign bits to be extracted.
2949/// \returns The sign bits from the operand, written to bits [3:0].
2950static __inline int __DEFAULT_FN_ATTRS
2951_mm256_movemask_pd(__m256d __a)
2952{
2953  return __builtin_ia32_movmskpd256((__v4df)__a);
2954}
2955
2956/// \brief Extracts the sign bits of double-precision floating point elements
2957///    in a 256-bit vector of [8 x float] and writes them to the lower order
2958///    bits of the return value.
2959///
2960/// \headerfile <x86intrin.h>
2961///
2962/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2963///
2964/// \param __a
2965///    A 256-bit vector of [8 x float] containing the double-precision floating
2966///    point values with sign bits to be extracted.
2967/// \returns The sign bits from the operand, written to bits [7:0].
2968static __inline int __DEFAULT_FN_ATTRS
2969_mm256_movemask_ps(__m256 __a)
2970{
2971  return __builtin_ia32_movmskps256((__v8sf)__a);
2972}
2973
2974/* Vector __zero */
2975/// \brief Zeroes the contents of all XMM or YMM registers.
2976///
2977/// \headerfile <x86intrin.h>
2978///
2979/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
2980static __inline void __DEFAULT_FN_ATTRS
2981_mm256_zeroall(void)
2982{
2983  __builtin_ia32_vzeroall();
2984}
2985
2986/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
2987///
2988/// \headerfile <x86intrin.h>
2989///
2990/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
2991static __inline void __DEFAULT_FN_ATTRS
2992_mm256_zeroupper(void)
2993{
2994  __builtin_ia32_vzeroupper();
2995}
2996
2997/* Vector load with broadcast */
2998/// \brief Loads a scalar single-precision floating point value from the
2999///    specified address pointed to by \a __a and broadcasts it to the elements
3000///    of a [4 x float] vector.
3001///
3002/// \headerfile <x86intrin.h>
3003///
3004/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3005///
3006/// \param __a
3007///    The single-precision floating point value to be broadcast.
3008/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3009///    equal to the broadcast value.
3010static __inline __m128 __DEFAULT_FN_ATTRS
3011_mm_broadcast_ss(float const *__a)
3012{
3013  float __f = *__a;
3014  return (__m128)(__v4sf){ __f, __f, __f, __f };
3015}
3016
3017/// \brief Loads a scalar double-precision floating point value from the
3018///    specified address pointed to by \a __a and broadcasts it to the elements
3019///    of a [4 x double] vector.
3020///
3021/// \headerfile <x86intrin.h>
3022///
3023/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3024///
3025/// \param __a
3026///    The double-precision floating point value to be broadcast.
3027/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3028///    equal to the broadcast value.
3029static __inline __m256d __DEFAULT_FN_ATTRS
3030_mm256_broadcast_sd(double const *__a)
3031{
3032  double __d = *__a;
3033  return (__m256d)(__v4df){ __d, __d, __d, __d };
3034}
3035
3036/// \brief Loads a scalar single-precision floating point value from the
3037///    specified address pointed to by \a __a and broadcasts it to the elements
3038///    of a [8 x float] vector.
3039///
3040/// \headerfile <x86intrin.h>
3041///
3042/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3043///
3044/// \param __a
3045///    The single-precision floating point value to be broadcast.
3046/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3047///    equal to the broadcast value.
3048static __inline __m256 __DEFAULT_FN_ATTRS
3049_mm256_broadcast_ss(float const *__a)
3050{
3051  float __f = *__a;
3052  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3053}
3054
3055/// \brief Loads the data from a 128-bit vector of [2 x double] from the
3056///    specified address pointed to by \a __a and broadcasts it to 128-bit
3057///    elements in a 256-bit vector of [4 x double].
3058///
3059/// \headerfile <x86intrin.h>
3060///
3061/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3062///
3063/// \param __a
3064///    The 128-bit vector of [2 x double] to be broadcast.
3065/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3066///    equal to the broadcast value.
3067static __inline __m256d __DEFAULT_FN_ATTRS
3068_mm256_broadcast_pd(__m128d const *__a)
3069{
3070  return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
3071}
3072
3073/// \brief Loads the data from a 128-bit vector of [4 x float] from the
3074///    specified address pointed to by \a __a and broadcasts it to 128-bit
3075///    elements in a 256-bit vector of [8 x float].
3076///
3077/// \headerfile <x86intrin.h>
3078///
3079/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3080///
3081/// \param __a
3082///    The 128-bit vector of [4 x float] to be broadcast.
3083/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3084///    equal to the broadcast value.
3085static __inline __m256 __DEFAULT_FN_ATTRS
3086_mm256_broadcast_ps(__m128 const *__a)
3087{
3088  return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
3089}
3090
3091/* SIMD load ops */
3092/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
3093///    memory location pointed to by \a __p into a vector of [4 x double].
3094///
3095/// \headerfile <x86intrin.h>
3096///
3097/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3098///
3099/// \param __p
3100///    A 32-byte aligned pointer to a memory location containing
3101///    double-precision floating point values.
3102/// \returns A 256-bit vector of [4 x double] containing the moved values.
3103static __inline __m256d __DEFAULT_FN_ATTRS
3104_mm256_load_pd(double const *__p)
3105{
3106  return *(__m256d *)__p;
3107}
3108
3109/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
3110///    memory location pointed to by \a __p into a vector of [8 x float].
3111///
3112/// \headerfile <x86intrin.h>
3113///
3114/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3115///
3116/// \param __p
3117///    A 32-byte aligned pointer to a memory location containing float values.
3118/// \returns A 256-bit vector of [8 x float] containing the moved values.
3119static __inline __m256 __DEFAULT_FN_ATTRS
3120_mm256_load_ps(float const *__p)
3121{
3122  return *(__m256 *)__p;
3123}
3124
3125/// \brief Loads 4 double-precision floating point values from an unaligned
3126///    memory location pointed to by \a __p into a vector of [4 x double].
3127///
3128/// \headerfile <x86intrin.h>
3129///
3130/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3131///
3132/// \param __p
3133///    A pointer to a memory location containing double-precision floating
3134///    point values.
3135/// \returns A 256-bit vector of [4 x double] containing the moved values.
3136static __inline __m256d __DEFAULT_FN_ATTRS
3137_mm256_loadu_pd(double const *__p)
3138{
3139  struct __loadu_pd {
3140    __m256d __v;
3141  } __attribute__((__packed__, __may_alias__));
3142  return ((struct __loadu_pd*)__p)->__v;
3143}
3144
3145/// \brief Loads 8 single-precision floating point values from an unaligned
3146///    memory location pointed to by \a __p into a vector of [8 x float].
3147///
3148/// \headerfile <x86intrin.h>
3149///
3150/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3151///
3152/// \param __p
3153///    A pointer to a memory location containing single-precision floating
3154///    point values.
3155/// \returns A 256-bit vector of [8 x float] containing the moved values.
3156static __inline __m256 __DEFAULT_FN_ATTRS
3157_mm256_loadu_ps(float const *__p)
3158{
3159  struct __loadu_ps {
3160    __m256 __v;
3161  } __attribute__((__packed__, __may_alias__));
3162  return ((struct __loadu_ps*)__p)->__v;
3163}
3164
3165/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
3166///    location pointed to by \a __p into elements of a 256-bit integer vector.
3167///
3168/// \headerfile <x86intrin.h>
3169///
3170/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3171///
3172/// \param __p
3173///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
3174///    values.
3175/// \returns A 256-bit integer vector containing the moved values.
3176static __inline __m256i __DEFAULT_FN_ATTRS
3177_mm256_load_si256(__m256i const *__p)
3178{
3179  return *__p;
3180}
3181
3182/// \brief Loads 256 bits of integer data from an unaligned memory location
3183///    pointed to by \a __p into a 256-bit integer vector.
3184///
3185/// \headerfile <x86intrin.h>
3186///
3187/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3188///
3189/// \param __p
3190///    A pointer to a 256-bit integer vector containing integer values.
3191/// \returns A 256-bit integer vector containing the moved values.
3192static __inline __m256i __DEFAULT_FN_ATTRS
3193_mm256_loadu_si256(__m256i const *__p)
3194{
3195  struct __loadu_si256 {
3196    __m256i __v;
3197  } __attribute__((__packed__, __may_alias__));
3198  return ((struct __loadu_si256*)__p)->__v;
3199}
3200
3201/// \brief Loads 256 bits of integer data from an unaligned memory location
3202///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3203///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
3204///    line boundary.
3205///
3206/// \headerfile <x86intrin.h>
3207///
3208/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3209///
3210/// \param __p
3211///    A pointer to a 256-bit integer vector containing integer values.
3212/// \returns A 256-bit integer vector containing the moved values.
3213static __inline __m256i __DEFAULT_FN_ATTRS
3214_mm256_lddqu_si256(__m256i const *__p)
3215{
3216  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3217}
3218
3219/* SIMD store ops */
3220/// \brief Stores double-precision floating point values from a 256-bit vector
3221///    of [4 x double] to a 32-byte aligned memory location pointed to by
3222///    \a __p.
3223///
3224/// \headerfile <x86intrin.h>
3225///
3226/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3227///
3228/// \param __p
3229///    A 32-byte aligned pointer to a memory location that will receive the
3230///    double-precision floaing point values.
3231/// \param __a
3232///    A 256-bit vector of [4 x double] containing the values to be moved.
3233static __inline void __DEFAULT_FN_ATTRS
3234_mm256_store_pd(double *__p, __m256d __a)
3235{
3236  *(__m256d *)__p = __a;
3237}
3238
3239/// \brief Stores single-precision floating point values from a 256-bit vector
3240///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3241///
3242/// \headerfile <x86intrin.h>
3243///
3244/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3245///
3246/// \param __p
3247///    A 32-byte aligned pointer to a memory location that will receive the
3248///    float values.
3249/// \param __a
3250///    A 256-bit vector of [8 x float] containing the values to be moved.
3251static __inline void __DEFAULT_FN_ATTRS
3252_mm256_store_ps(float *__p, __m256 __a)
3253{
3254  *(__m256 *)__p = __a;
3255}
3256
3257/// \brief Stores double-precision floating point values from a 256-bit vector
3258///    of [4 x double] to an unaligned memory location pointed to by \a __p.
3259///
3260/// \headerfile <x86intrin.h>
3261///
3262/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3263///
3264/// \param __p
3265///    A pointer to a memory location that will receive the double-precision
3266///    floating point values.
3267/// \param __a
3268///    A 256-bit vector of [4 x double] containing the values to be moved.
3269static __inline void __DEFAULT_FN_ATTRS
3270_mm256_storeu_pd(double *__p, __m256d __a)
3271{
3272  struct __storeu_pd {
3273    __m256d __v;
3274  } __attribute__((__packed__, __may_alias__));
3275  ((struct __storeu_pd*)__p)->__v = __a;
3276}
3277
3278/// \brief Stores single-precision floating point values from a 256-bit vector
3279///    of [8 x float] to an unaligned memory location pointed to by \a __p.
3280///
3281/// \headerfile <x86intrin.h>
3282///
3283/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3284///
3285/// \param __p
3286///    A pointer to a memory location that will receive the float values.
3287/// \param __a
3288///    A 256-bit vector of [8 x float] containing the values to be moved.
3289static __inline void __DEFAULT_FN_ATTRS
3290_mm256_storeu_ps(float *__p, __m256 __a)
3291{
3292  struct __storeu_ps {
3293    __m256 __v;
3294  } __attribute__((__packed__, __may_alias__));
3295  ((struct __storeu_ps*)__p)->__v = __a;
3296}
3297
3298/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
3299///    aligned memory location pointed to by \a __p.
3300///
3301/// \headerfile <x86intrin.h>
3302///
3303/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3304///
3305/// \param __p
3306///    A 32-byte aligned pointer to a memory location that will receive the
3307///    integer values.
3308/// \param __a
3309///    A 256-bit integer vector containing the values to be moved.
3310static __inline void __DEFAULT_FN_ATTRS
3311_mm256_store_si256(__m256i *__p, __m256i __a)
3312{
3313  *__p = __a;
3314}
3315
3316/// \brief Stores integer values from a 256-bit integer vector to an unaligned
3317///    memory location pointed to by \a __p.
3318///
3319/// \headerfile <x86intrin.h>
3320///
3321/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3322///
3323/// \param __p
3324///    A pointer to a memory location that will receive the integer values.
3325/// \param __a
3326///    A 256-bit integer vector containing the values to be moved.
3327static __inline void __DEFAULT_FN_ATTRS
3328_mm256_storeu_si256(__m256i *__p, __m256i __a)
3329{
3330  struct __storeu_si256 {
3331    __m256i __v;
3332  } __attribute__((__packed__, __may_alias__));
3333  ((struct __storeu_si256*)__p)->__v = __a;
3334}
3335
3336/* Conditional load ops */
3337/// \brief Conditionally loads double-precision floating point elements from a
3338///    memory location pointed to by \a __p into a 128-bit vector of
3339///    [2 x double], depending on the mask bits associated with each data
3340///    element.
3341///
3342/// \headerfile <x86intrin.h>
3343///
3344/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3345///
3346/// \param __p
3347///    A pointer to a memory location that contains the double-precision
3348///    floating point values.
3349/// \param __m
3350///    A 128-bit integer vector containing the mask. The most significant bit of
3351///    each data element represents the mask bits. If a mask bit is zero, the
3352///    corresponding value in the memory location is not loaded and the
3353///    corresponding field in the return value is set to zero.
3354/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3355static __inline __m128d __DEFAULT_FN_ATTRS
3356_mm_maskload_pd(double const *__p, __m128i __m)
3357{
3358  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3359}
3360
3361/// \brief Conditionally loads double-precision floating point elements from a
3362///    memory location pointed to by \a __p into a 256-bit vector of
3363///    [4 x double], depending on the mask bits associated with each data
3364///    element.
3365///
3366/// \headerfile <x86intrin.h>
3367///
3368/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3369///
3370/// \param __p
3371///    A pointer to a memory location that contains the double-precision
3372///    floating point values.
3373/// \param __m
3374///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3375///    significant bit of each quadword element represents the mask bits. If a
3376///    mask bit is zero, the corresponding value in the memory location is not
3377///    loaded and the corresponding field in the return value is set to zero.
3378/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3379static __inline __m256d __DEFAULT_FN_ATTRS
3380_mm256_maskload_pd(double const *__p, __m256i __m)
3381{
3382  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3383                                               (__v4di)__m);
3384}
3385
3386/// \brief Conditionally loads single-precision floating point elements from a
3387///    memory location pointed to by \a __p into a 128-bit vector of
3388///    [4 x float], depending on the mask bits associated with each data
3389///    element.
3390///
3391/// \headerfile <x86intrin.h>
3392///
3393/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3394///
3395/// \param __p
3396///    A pointer to a memory location that contains the single-precision
3397///    floating point values.
3398/// \param __m
3399///    A 128-bit integer vector containing the mask. The most significant bit of
3400///    each data element represents the mask bits. If a mask bit is zero, the
3401///    corresponding value in the memory location is not loaded and the
3402///    corresponding field in the return value is set to zero.
3403/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3404static __inline __m128 __DEFAULT_FN_ATTRS
3405_mm_maskload_ps(float const *__p, __m128i __m)
3406{
3407  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3408}
3409
3410/// \brief Conditionally loads single-precision floating point elements from a
3411///    memory location pointed to by \a __p into a 256-bit vector of
3412///    [8 x float], depending on the mask bits associated with each data
3413///    element.
3414///
3415/// \headerfile <x86intrin.h>
3416///
3417/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3418///
3419/// \param __p
3420///    A pointer to a memory location that contains the single-precision
3421///    floating point values.
3422/// \param __m
3423///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3424///    significant bit of each dword element represents the mask bits. If a mask
3425///    bit is zero, the corresponding value in the memory location is not loaded
3426///    and the corresponding field in the return value is set to zero.
3427/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3428static __inline __m256 __DEFAULT_FN_ATTRS
3429_mm256_maskload_ps(float const *__p, __m256i __m)
3430{
3431  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3432}
3433
3434/* Conditional store ops */
3435/// \brief Moves single-precision floating point values from a 256-bit vector
3436///    of [8 x float] to a memory location pointed to by \a __p, according to
3437///    the specified mask.
3438///
3439/// \headerfile <x86intrin.h>
3440///
3441/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3442///
3443/// \param __p
3444///    A pointer to a memory location that will receive the float values.
3445/// \param __m
3446///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3447///    significant bit of each dword element in the mask vector represents the
3448///    mask bits. If a mask bit is zero, the corresponding value from vector
3449///    \a __a is not stored and the corresponding field in the memory location
3450///    pointed to by \a __p is not changed.
3451/// \param __a
3452///    A 256-bit vector of [8 x float] containing the values to be stored.
3453static __inline void __DEFAULT_FN_ATTRS
3454_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3455{
3456  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3457}
3458
3459/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
3460///    to a memory location pointed to by \a __p, according to the specified
3461///    mask.
3462///
3463/// \headerfile <x86intrin.h>
3464///
3465/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3466///
3467/// \param __p
3468///    A pointer to a memory location that will receive the float values.
3469/// \param __m
3470///    A 128-bit integer vector containing the mask. The most significant bit of
3471///    each field in the mask vector represents the mask bits. If a mask bit is
3472///    zero, the corresponding value from vector \a __a is not stored and the
3473///    corresponding field in the memory location pointed to by \a __p is not
3474///    changed.
3475/// \param __a
3476///    A 128-bit vector of [2 x double] containing the values to be stored.
3477static __inline void __DEFAULT_FN_ATTRS
3478_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3479{
3480  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3481}
3482
3483/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3484///    to a memory location pointed to by \a __p, according to the specified
3485///    mask.
3486///
3487/// \headerfile <x86intrin.h>
3488///
3489/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3490///
3491/// \param __p
3492///    A pointer to a memory location that will receive the float values.
3493/// \param __m
3494///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3495///    significant bit of each quadword element in the mask vector represents
3496///    the mask bits. If a mask bit is zero, the corresponding value from vector
3497///    __a is not stored and the corresponding field in the memory location
3498///    pointed to by \a __p is not changed.
3499/// \param __a
3500///    A 256-bit vector of [4 x double] containing the values to be stored.
3501static __inline void __DEFAULT_FN_ATTRS
3502_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3503{
3504  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3505}
3506
3507/// \brief Moves single-precision floating point values from a 128-bit vector
3508///    of [4 x float] to a memory location pointed to by \a __p, according to
3509///    the specified mask.
3510///
3511/// \headerfile <x86intrin.h>
3512///
3513/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3514///
3515/// \param __p
3516///    A pointer to a memory location that will receive the float values.
3517/// \param __m
3518///    A 128-bit integer vector containing the mask. The most significant bit of
3519///    each field in the mask vector represents the mask bits. If a mask bit is
3520///    zero, the corresponding value from vector __a is not stored and the
3521///    corresponding field in the memory location pointed to by \a __p is not
3522///    changed.
3523/// \param __a
3524///    A 128-bit vector of [4 x float] containing the values to be stored.
3525static __inline void __DEFAULT_FN_ATTRS
3526_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3527{
3528  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3529}
3530
3531/* Cacheability support ops */
3532/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
3533///    aligned memory location. To minimize caching, the data is flagged as
3534///    non-temporal (unlikely to be used again soon).
3535///
3536/// \headerfile <x86intrin.h>
3537///
3538/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3539///
3540/// \param __a
3541///    A pointer to a 32-byte aligned memory location that will receive the
3542///    integer values.
3543/// \param __b
3544///    A 256-bit integer vector containing the values to be moved.
3545static __inline void __DEFAULT_FN_ATTRS
3546_mm256_stream_si256(__m256i *__a, __m256i __b)
3547{
3548  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
3549}
3550
3551/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3552///    to a 32-byte aligned memory location. To minimize caching, the data is
3553///    flagged as non-temporal (unlikely to be used again soon).
3554///
3555/// \headerfile <x86intrin.h>
3556///
3557/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3558///
3559/// \param __a
3560///    A pointer to a 32-byte aligned memory location that will receive the
3561///    integer values.
3562/// \param __b
3563///    A 256-bit vector of [4 x double] containing the values to be moved.
3564static __inline void __DEFAULT_FN_ATTRS
3565_mm256_stream_pd(double *__a, __m256d __b)
3566{
3567  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
3568}
3569
3570/// \brief Moves single-precision floating point values from a 256-bit vector
3571///    of [8 x float] to a 32-byte aligned memory location. To minimize
3572///    caching, the data is flagged as non-temporal (unlikely to be used again
3573///    soon).
3574///
3575/// \headerfile <x86intrin.h>
3576///
3577/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3578///
3579/// \param __p
3580///    A pointer to a 32-byte aligned memory location that will receive the
3581///    single-precision floating point values.
3582/// \param __a
3583///    A 256-bit vector of [8 x float] containing the values to be moved.
3584static __inline void __DEFAULT_FN_ATTRS
3585_mm256_stream_ps(float *__p, __m256 __a)
3586{
3587  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
3588}
3589
3590/* Create vectors */
3591/// \brief Create a 256-bit vector of [4 x double] with undefined values.
3592///
3593/// \headerfile <x86intrin.h>
3594///
3595/// This intrinsic has no corresponding instruction.
3596///
3597/// \returns A 256-bit vector of [4 x double] containing undefined values.
3598static __inline__ __m256d __DEFAULT_FN_ATTRS
3599_mm256_undefined_pd(void)
3600{
3601  return (__m256d)__builtin_ia32_undef256();
3602}
3603
3604/// \brief Create a 256-bit vector of [8 x float] with undefined values.
3605///
3606/// \headerfile <x86intrin.h>
3607///
3608/// This intrinsic has no corresponding instruction.
3609///
3610/// \returns A 256-bit vector of [8 x float] containing undefined values.
3611static __inline__ __m256 __DEFAULT_FN_ATTRS
3612_mm256_undefined_ps(void)
3613{
3614  return (__m256)__builtin_ia32_undef256();
3615}
3616
3617/// \brief Create a 256-bit integer vector with undefined values.
3618///
3619/// \headerfile <x86intrin.h>
3620///
3621/// This intrinsic has no corresponding instruction.
3622///
3623/// \returns A 256-bit integer vector containing undefined values.
3624static __inline__ __m256i __DEFAULT_FN_ATTRS
3625_mm256_undefined_si256(void)
3626{
3627  return (__m256i)__builtin_ia32_undef256();
3628}
3629
3630/// \brief Constructs a 256-bit floating-point vector of [4 x double]
3631///    initialized with the specified double-precision floating-point values.
3632///
3633/// \headerfile <x86intrin.h>
3634///
3635/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3636///   instruction.
3637///
3638/// \param __a
3639///    A double-precision floating-point value used to initialize bits [255:192]
3640///    of the result.
3641/// \param __b
3642///    A double-precision floating-point value used to initialize bits [191:128]
3643///    of the result.
3644/// \param __c
3645///    A double-precision floating-point value used to initialize bits [127:64]
3646///    of the result.
3647/// \param __d
3648///    A double-precision floating-point value used to initialize bits [63:0]
3649///    of the result.
3650/// \returns An initialized 256-bit floating-point vector of [4 x double].
3651static __inline __m256d __DEFAULT_FN_ATTRS
3652_mm256_set_pd(double __a, double __b, double __c, double __d)
3653{
3654  return (__m256d){ __d, __c, __b, __a };
3655}
3656
3657/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
3658///    with the specified single-precision floating-point values.
3659///
3660/// \headerfile <x86intrin.h>
3661///
3662/// This intrinsic is a utility function and does not correspond to a specific
3663///   instruction.
3664///
3665/// \param __a
3666///    A single-precision floating-point value used to initialize bits [255:224]
3667///    of the result.
3668/// \param __b
3669///    A single-precision floating-point value used to initialize bits [223:192]
3670///    of the result.
3671/// \param __c
3672///    A single-precision floating-point value used to initialize bits [191:160]
3673///    of the result.
3674/// \param __d
3675///    A single-precision floating-point value used to initialize bits [159:128]
3676///    of the result.
3677/// \param __e
3678///    A single-precision floating-point value used to initialize bits [127:96]
3679///    of the result.
3680/// \param __f
3681///    A single-precision floating-point value used to initialize bits [95:64]
3682///    of the result.
3683/// \param __g
3684///    A single-precision floating-point value used to initialize bits [63:32]
3685///    of the result.
3686/// \param __h
3687///    A single-precision floating-point value used to initialize bits [31:0]
3688///    of the result.
3689/// \returns An initialized 256-bit floating-point vector of [8 x float].
3690static __inline __m256 __DEFAULT_FN_ATTRS
3691_mm256_set_ps(float __a, float __b, float __c, float __d,
3692              float __e, float __f, float __g, float __h)
3693{
3694  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3695}
3696
3697/// \brief Constructs a 256-bit integer vector initialized with the specified
3698///    32-bit integral values.
3699///
3700/// \headerfile <x86intrin.h>
3701///
3702/// This intrinsic is a utility function and does not correspond to a specific
3703///   instruction.
3704///
3705/// \param __i0
3706///    A 32-bit integral value used to initialize bits [255:224] of the result.
3707/// \param __i1
3708///    A 32-bit integral value used to initialize bits [223:192] of the result.
3709/// \param __i2
3710///    A 32-bit integral value used to initialize bits [191:160] of the result.
3711/// \param __i3
3712///    A 32-bit integral value used to initialize bits [159:128] of the result.
3713/// \param __i4
3714///    A 32-bit integral value used to initialize bits [127:96] of the result.
3715/// \param __i5
3716///    A 32-bit integral value used to initialize bits [95:64] of the result.
3717/// \param __i6
3718///    A 32-bit integral value used to initialize bits [63:32] of the result.
3719/// \param __i7
3720///    A 32-bit integral value used to initialize bits [31:0] of the result.
3721/// \returns An initialized 256-bit integer vector.
3722static __inline __m256i __DEFAULT_FN_ATTRS
3723_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3724                 int __i4, int __i5, int __i6, int __i7)
3725{
3726  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3727}
3728
3729/// \brief Constructs a 256-bit integer vector initialized with the specified
3730///    16-bit integral values.
3731///
3732/// \headerfile <x86intrin.h>
3733///
3734/// This intrinsic is a utility function and does not correspond to a specific
3735///   instruction.
3736///
3737/// \param __w15
3738///    A 16-bit integral value used to initialize bits [255:240] of the result.
3739/// \param __w14
3740///    A 16-bit integral value used to initialize bits [239:224] of the result.
3741/// \param __w13
3742///    A 16-bit integral value used to initialize bits [223:208] of the result.
3743/// \param __w12
3744///    A 16-bit integral value used to initialize bits [207:192] of the result.
3745/// \param __w11
3746///    A 16-bit integral value used to initialize bits [191:176] of the result.
3747/// \param __w10
3748///    A 16-bit integral value used to initialize bits [175:160] of the result.
3749/// \param __w09
3750///    A 16-bit integral value used to initialize bits [159:144] of the result.
3751/// \param __w08
3752///    A 16-bit integral value used to initialize bits [143:128] of the result.
3753/// \param __w07
3754///    A 16-bit integral value used to initialize bits [127:112] of the result.
3755/// \param __w06
3756///    A 16-bit integral value used to initialize bits [111:96] of the result.
3757/// \param __w05
3758///    A 16-bit integral value used to initialize bits [95:80] of the result.
3759/// \param __w04
3760///    A 16-bit integral value used to initialize bits [79:64] of the result.
3761/// \param __w03
3762///    A 16-bit integral value used to initialize bits [63:48] of the result.
3763/// \param __w02
3764///    A 16-bit integral value used to initialize bits [47:32] of the result.
3765/// \param __w01
3766///    A 16-bit integral value used to initialize bits [31:16] of the result.
3767/// \param __w00
3768///    A 16-bit integral value used to initialize bits [15:0] of the result.
3769/// \returns An initialized 256-bit integer vector.
3770static __inline __m256i __DEFAULT_FN_ATTRS
3771_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3772                 short __w11, short __w10, short __w09, short __w08,
3773                 short __w07, short __w06, short __w05, short __w04,
3774                 short __w03, short __w02, short __w01, short __w00)
3775{
3776  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3777    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3778}
3779
3780/// \brief Constructs a 256-bit integer vector initialized with the specified
3781///    8-bit integral values.
3782///
3783/// \headerfile <x86intrin.h>
3784///
3785/// This intrinsic is a utility function and does not correspond to a specific
3786///   instruction.
3787///
3788/// \param __b31
3789///    An 8-bit integral value used to initialize bits [255:248] of the result.
3790/// \param __b30
3791///    An 8-bit integral value used to initialize bits [247:240] of the result.
3792/// \param __b29
3793///    An 8-bit integral value used to initialize bits [239:232] of the result.
3794/// \param __b28
3795///    An 8-bit integral value used to initialize bits [231:224] of the result.
3796/// \param __b27
3797///    An 8-bit integral value used to initialize bits [223:216] of the result.
3798/// \param __b26
3799///    An 8-bit integral value used to initialize bits [215:208] of the result.
3800/// \param __b25
3801///    An 8-bit integral value used to initialize bits [207:200] of the result.
3802/// \param __b24
3803///    An 8-bit integral value used to initialize bits [199:192] of the result.
3804/// \param __b23
3805///    An 8-bit integral value used to initialize bits [191:184] of the result.
3806/// \param __b22
3807///    An 8-bit integral value used to initialize bits [183:176] of the result.
3808/// \param __b21
3809///    An 8-bit integral value used to initialize bits [175:168] of the result.
3810/// \param __b20
3811///    An 8-bit integral value used to initialize bits [167:160] of the result.
3812/// \param __b19
3813///    An 8-bit integral value used to initialize bits [159:152] of the result.
3814/// \param __b18
3815///    An 8-bit integral value used to initialize bits [151:144] of the result.
3816/// \param __b17
3817///    An 8-bit integral value used to initialize bits [143:136] of the result.
3818/// \param __b16
3819///    An 8-bit integral value used to initialize bits [135:128] of the result.
3820/// \param __b15
3821///    An 8-bit integral value used to initialize bits [127:120] of the result.
3822/// \param __b14
3823///    An 8-bit integral value used to initialize bits [119:112] of the result.
3824/// \param __b13
3825///    An 8-bit integral value used to initialize bits [111:104] of the result.
3826/// \param __b12
3827///    An 8-bit integral value used to initialize bits [103:96] of the result.
3828/// \param __b11
3829///    An 8-bit integral value used to initialize bits [95:88] of the result.
3830/// \param __b10
3831///    An 8-bit integral value used to initialize bits [87:80] of the result.
3832/// \param __b09
3833///    An 8-bit integral value used to initialize bits [79:72] of the result.
3834/// \param __b08
3835///    An 8-bit integral value used to initialize bits [71:64] of the result.
3836/// \param __b07
3837///    An 8-bit integral value used to initialize bits [63:56] of the result.
3838/// \param __b06
3839///    An 8-bit integral value used to initialize bits [55:48] of the result.
3840/// \param __b05
3841///    An 8-bit integral value used to initialize bits [47:40] of the result.
3842/// \param __b04
3843///    An 8-bit integral value used to initialize bits [39:32] of the result.
3844/// \param __b03
3845///    An 8-bit integral value used to initialize bits [31:24] of the result.
3846/// \param __b02
3847///    An 8-bit integral value used to initialize bits [23:16] of the result.
3848/// \param __b01
3849///    An 8-bit integral value used to initialize bits [15:8] of the result.
3850/// \param __b00
3851///    An 8-bit integral value used to initialize bits [7:0] of the result.
3852/// \returns An initialized 256-bit integer vector.
3853static __inline __m256i __DEFAULT_FN_ATTRS
3854_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3855                char __b27, char __b26, char __b25, char __b24,
3856                char __b23, char __b22, char __b21, char __b20,
3857                char __b19, char __b18, char __b17, char __b16,
3858                char __b15, char __b14, char __b13, char __b12,
3859                char __b11, char __b10, char __b09, char __b08,
3860                char __b07, char __b06, char __b05, char __b04,
3861                char __b03, char __b02, char __b01, char __b00)
3862{
3863  return (__m256i)(__v32qi){
3864    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3865    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3866    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3867    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3868  };
3869}
3870
3871/// \brief Constructs a 256-bit integer vector initialized with the specified
3872///    64-bit integral values.
3873///
3874/// \headerfile <x86intrin.h>
3875///
3876/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3877///   instruction.
3878///
3879/// \param __a
3880///    A 64-bit integral value used to initialize bits [255:192] of the result.
3881/// \param __b
3882///    A 64-bit integral value used to initialize bits [191:128] of the result.
3883/// \param __c
3884///    A 64-bit integral value used to initialize bits [127:64] of the result.
3885/// \param __d
3886///    A 64-bit integral value used to initialize bits [63:0] of the result.
3887/// \returns An initialized 256-bit integer vector.
3888static __inline __m256i __DEFAULT_FN_ATTRS
3889_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3890{
3891  return (__m256i)(__v4di){ __d, __c, __b, __a };
3892}
3893
3894/* Create vectors with elements in reverse order */
3895/// \brief Constructs a 256-bit floating-point vector of [4 x double],
3896///    initialized in reverse order with the specified double-precision
3897///    floating-point values.
3898///
3899/// \headerfile <x86intrin.h>
3900///
3901/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3902///   instruction.
3903///
3904/// \param __a
3905///    A double-precision floating-point value used to initialize bits [63:0]
3906///    of the result.
3907/// \param __b
3908///    A double-precision floating-point value used to initialize bits [127:64]
3909///    of the result.
3910/// \param __c
3911///    A double-precision floating-point value used to initialize bits [191:128]
3912///    of the result.
3913/// \param __d
3914///    A double-precision floating-point value used to initialize bits [255:192]
3915///    of the result.
3916/// \returns An initialized 256-bit floating-point vector of [4 x double].
3917static __inline __m256d __DEFAULT_FN_ATTRS
3918_mm256_setr_pd(double __a, double __b, double __c, double __d)
3919{
3920  return (__m256d){ __a, __b, __c, __d };
3921}
3922
3923/// \brief Constructs a 256-bit floating-point vector of [8 x float],
3924///    initialized in reverse order with the specified single-precision
3925///    float-point values.
3926///
3927/// \headerfile <x86intrin.h>
3928///
3929/// This intrinsic is a utility function and does not correspond to a specific
3930///   instruction.
3931///
3932/// \param __a
3933///    A single-precision floating-point value used to initialize bits [31:0]
3934///    of the result.
3935/// \param __b
3936///    A single-precision floating-point value used to initialize bits [63:32]
3937///    of the result.
3938/// \param __c
3939///    A single-precision floating-point value used to initialize bits [95:64]
3940///    of the result.
3941/// \param __d
3942///    A single-precision floating-point value used to initialize bits [127:96]
3943///    of the result.
3944/// \param __e
3945///    A single-precision floating-point value used to initialize bits [159:128]
3946///    of the result.
3947/// \param __f
3948///    A single-precision floating-point value used to initialize bits [191:160]
3949///    of the result.
3950/// \param __g
3951///    A single-precision floating-point value used to initialize bits [223:192]
3952///    of the result.
3953/// \param __h
3954///    A single-precision floating-point value used to initialize bits [255:224]
3955///    of the result.
3956/// \returns An initialized 256-bit floating-point vector of [8 x float].
3957static __inline __m256 __DEFAULT_FN_ATTRS
3958_mm256_setr_ps(float __a, float __b, float __c, float __d,
3959               float __e, float __f, float __g, float __h)
3960{
3961  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
3962}
3963
3964/// \brief Constructs a 256-bit integer vector, initialized in reverse order
3965///    with the specified 32-bit integral values.
3966///
3967/// \headerfile <x86intrin.h>
3968///
3969/// This intrinsic is a utility function and does not correspond to a specific
3970///   instruction.
3971///
3972/// \param __i0
3973///    A 32-bit integral value used to initialize bits [31:0] of the result.
3974/// \param __i1
3975///    A 32-bit integral value used to initialize bits [63:32] of the result.
3976/// \param __i2
3977///    A 32-bit integral value used to initialize bits [95:64] of the result.
3978/// \param __i3
3979///    A 32-bit integral value used to initialize bits [127:96] of the result.
3980/// \param __i4
3981///    A 32-bit integral value used to initialize bits [159:128] of the result.
3982/// \param __i5
3983///    A 32-bit integral value used to initialize bits [191:160] of the result.
3984/// \param __i6
3985///    A 32-bit integral value used to initialize bits [223:192] of the result.
3986/// \param __i7
3987///    A 32-bit integral value used to initialize bits [255:224] of the result.
3988/// \returns An initialized 256-bit integer vector.
3989static __inline __m256i __DEFAULT_FN_ATTRS
3990_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3991                  int __i4, int __i5, int __i6, int __i7)
3992{
3993  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
3994}
3995
3996/// \brief Constructs a 256-bit integer vector, initialized in reverse order
3997///    with the specified 16-bit integral values.
3998///
3999/// \headerfile <x86intrin.h>
4000///
4001/// This intrinsic is a utility function and does not correspond to a specific
4002///   instruction.
4003///
4004/// \param __w15
4005///    A 16-bit integral value used to initialize bits [15:0] of the result.
4006/// \param __w14
4007///    A 16-bit integral value used to initialize bits [31:16] of the result.
4008/// \param __w13
4009///    A 16-bit integral value used to initialize bits [47:32] of the result.
4010/// \param __w12
4011///    A 16-bit integral value used to initialize bits [63:48] of the result.
4012/// \param __w11
4013///    A 16-bit integral value used to initialize bits [79:64] of the result.
4014/// \param __w10
4015///    A 16-bit integral value used to initialize bits [95:80] of the result.
4016/// \param __w09
4017///    A 16-bit integral value used to initialize bits [111:96] of the result.
4018/// \param __w08
4019///    A 16-bit integral value used to initialize bits [127:112] of the result.
4020/// \param __w07
4021///    A 16-bit integral value used to initialize bits [143:128] of the result.
4022/// \param __w06
4023///    A 16-bit integral value used to initialize bits [159:144] of the result.
4024/// \param __w05
4025///    A 16-bit integral value used to initialize bits [175:160] of the result.
4026/// \param __w04
4027///    A 16-bit integral value used to initialize bits [191:176] of the result.
4028/// \param __w03
4029///    A 16-bit integral value used to initialize bits [207:192] of the result.
4030/// \param __w02
4031///    A 16-bit integral value used to initialize bits [223:208] of the result.
4032/// \param __w01
4033///    A 16-bit integral value used to initialize bits [239:224] of the result.
4034/// \param __w00
4035///    A 16-bit integral value used to initialize bits [255:240] of the result.
4036/// \returns An initialized 256-bit integer vector.
4037static __inline __m256i __DEFAULT_FN_ATTRS
4038_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4039       short __w11, short __w10, short __w09, short __w08,
4040       short __w07, short __w06, short __w05, short __w04,
4041       short __w03, short __w02, short __w01, short __w00)
4042{
4043  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
4044    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
4045}
4046
4047/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4048///    with the specified 8-bit integral values.
4049///
4050/// \headerfile <x86intrin.h>
4051///
4052/// This intrinsic is a utility function and does not correspond to a specific
4053///   instruction.
4054///
4055/// \param __b31
4056///    An 8-bit integral value used to initialize bits [7:0] of the result.
4057/// \param __b30
4058///    An 8-bit integral value used to initialize bits [15:8] of the result.
4059/// \param __b29
4060///    An 8-bit integral value used to initialize bits [23:16] of the result.
4061/// \param __b28
4062///    An 8-bit integral value used to initialize bits [31:24] of the result.
4063/// \param __b27
4064///    An 8-bit integral value used to initialize bits [39:32] of the result.
4065/// \param __b26
4066///    An 8-bit integral value used to initialize bits [47:40] of the result.
4067/// \param __b25
4068///    An 8-bit integral value used to initialize bits [55:48] of the result.
4069/// \param __b24
4070///    An 8-bit integral value used to initialize bits [63:56] of the result.
4071/// \param __b23
4072///    An 8-bit integral value used to initialize bits [71:64] of the result.
4073/// \param __b22
4074///    An 8-bit integral value used to initialize bits [79:72] of the result.
4075/// \param __b21
4076///    An 8-bit integral value used to initialize bits [87:80] of the result.
4077/// \param __b20
4078///    An 8-bit integral value used to initialize bits [95:88] of the result.
4079/// \param __b19
4080///    An 8-bit integral value used to initialize bits [103:96] of the result.
4081/// \param __b18
4082///    An 8-bit integral value used to initialize bits [111:104] of the result.
4083/// \param __b17
4084///    An 8-bit integral value used to initialize bits [119:112] of the result.
4085/// \param __b16
4086///    An 8-bit integral value used to initialize bits [127:120] of the result.
4087/// \param __b15
4088///    An 8-bit integral value used to initialize bits [135:128] of the result.
4089/// \param __b14
4090///    An 8-bit integral value used to initialize bits [143:136] of the result.
4091/// \param __b13
4092///    An 8-bit integral value used to initialize bits [151:144] of the result.
4093/// \param __b12
4094///    An 8-bit integral value used to initialize bits [159:152] of the result.
4095/// \param __b11
4096///    An 8-bit integral value used to initialize bits [167:160] of the result.
4097/// \param __b10
4098///    An 8-bit integral value used to initialize bits [175:168] of the result.
4099/// \param __b09
4100///    An 8-bit integral value used to initialize bits [183:176] of the result.
4101/// \param __b08
4102///    An 8-bit integral value used to initialize bits [191:184] of the result.
4103/// \param __b07
4104///    An 8-bit integral value used to initialize bits [199:192] of the result.
4105/// \param __b06
4106///    An 8-bit integral value used to initialize bits [207:200] of the result.
4107/// \param __b05
4108///    An 8-bit integral value used to initialize bits [215:208] of the result.
4109/// \param __b04
4110///    An 8-bit integral value used to initialize bits [223:216] of the result.
4111/// \param __b03
4112///    An 8-bit integral value used to initialize bits [231:224] of the result.
4113/// \param __b02
4114///    An 8-bit integral value used to initialize bits [239:232] of the result.
4115/// \param __b01
4116///    An 8-bit integral value used to initialize bits [247:240] of the result.
4117/// \param __b00
4118///    An 8-bit integral value used to initialize bits [255:248] of the result.
4119/// \returns An initialized 256-bit integer vector.
4120static __inline __m256i __DEFAULT_FN_ATTRS
4121_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4122                 char __b27, char __b26, char __b25, char __b24,
4123                 char __b23, char __b22, char __b21, char __b20,
4124                 char __b19, char __b18, char __b17, char __b16,
4125                 char __b15, char __b14, char __b13, char __b12,
4126                 char __b11, char __b10, char __b09, char __b08,
4127                 char __b07, char __b06, char __b05, char __b04,
4128                 char __b03, char __b02, char __b01, char __b00)
4129{
4130  return (__m256i)(__v32qi){
4131    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
4132    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
4133    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
4134    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
4135}
4136
4137/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4138///    with the specified 64-bit integral values.
4139///
4140/// \headerfile <x86intrin.h>
4141///
4142/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4143///   instruction.
4144///
4145/// \param __a
4146///    A 64-bit integral value used to initialize bits [63:0] of the result.
4147/// \param __b
4148///    A 64-bit integral value used to initialize bits [127:64] of the result.
4149/// \param __c
4150///    A 64-bit integral value used to initialize bits [191:128] of the result.
4151/// \param __d
4152///    A 64-bit integral value used to initialize bits [255:192] of the result.
4153/// \returns An initialized 256-bit integer vector.
4154static __inline __m256i __DEFAULT_FN_ATTRS
4155_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4156{
4157  return (__m256i)(__v4di){ __a, __b, __c, __d };
4158}
4159
4160/* Create vectors with repeated elements */
4161/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
4162///    of the four double-precision floating-point vector elements set to the
4163///    specified double-precision floating-point value.
4164///
4165/// \headerfile <x86intrin.h>
4166///
4167/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4168///
4169/// \param __w
4170///    A double-precision floating-point value used to initialize each vector
4171///    element of the result.
4172/// \returns An initialized 256-bit floating-point vector of [4 x double].
4173static __inline __m256d __DEFAULT_FN_ATTRS
4174_mm256_set1_pd(double __w)
4175{
4176  return (__m256d){ __w, __w, __w, __w };
4177}
4178
4179/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
4180///    of the eight single-precision floating-point vector elements set to the
4181///    specified single-precision floating-point value.
4182///
4183/// \headerfile <x86intrin.h>
4184///
4185/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4186///   instruction.
4187///
4188/// \param __w
4189///    A single-precision floating-point value used to initialize each vector
4190///    element of the result.
4191/// \returns An initialized 256-bit floating-point vector of [8 x float].
4192static __inline __m256 __DEFAULT_FN_ATTRS
4193_mm256_set1_ps(float __w)
4194{
4195  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
4196}
4197
4198/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
4199///    32-bit integral vector elements set to the specified 32-bit integral
4200///    value.
4201///
4202/// \headerfile <x86intrin.h>
4203///
4204/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4205///   instruction.
4206///
4207/// \param __i
4208///    A 32-bit integral value used to initialize each vector element of the
4209///    result.
4210/// \returns An initialized 256-bit integer vector of [8 x i32].
4211static __inline __m256i __DEFAULT_FN_ATTRS
4212_mm256_set1_epi32(int __i)
4213{
4214  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
4215}
4216
4217/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
4218///    16-bit integral vector elements set to the specified 16-bit integral
4219///    value.
4220///
4221/// \headerfile <x86intrin.h>
4222///
4223/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4224///
4225/// \param __w
4226///    A 16-bit integral value used to initialize each vector element of the
4227///    result.
4228/// \returns An initialized 256-bit integer vector of [16 x i16].
4229static __inline __m256i __DEFAULT_FN_ATTRS
4230_mm256_set1_epi16(short __w)
4231{
4232  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
4233    __w, __w, __w, __w, __w, __w };
4234}
4235
4236/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
4237///    8-bit integral vector elements set to the specified 8-bit integral value.
4238///
4239/// \headerfile <x86intrin.h>
4240///
4241/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4242///
4243/// \param __b
4244///    An 8-bit integral value used to initialize each vector element of the
4245///    result.
4246/// \returns An initialized 256-bit integer vector of [32 x i8].
4247static __inline __m256i __DEFAULT_FN_ATTRS
4248_mm256_set1_epi8(char __b)
4249{
4250  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4251    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4252    __b, __b, __b, __b, __b, __b, __b };
4253}
4254
4255/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
4256///    64-bit integral vector elements set to the specified 64-bit integral
4257///    value.
4258///
4259/// \headerfile <x86intrin.h>
4260///
4261/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4262///
4263/// \param __q
4264///    A 64-bit integral value used to initialize each vector element of the
4265///    result.
4266/// \returns An initialized 256-bit integer vector of [4 x i64].
4267static __inline __m256i __DEFAULT_FN_ATTRS
4268_mm256_set1_epi64x(long long __q)
4269{
4270  return (__m256i)(__v4di){ __q, __q, __q, __q };
4271}
4272
/* Create zeroed vectors */
4274/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
4275///    vector elements initialized to zero.
4276///
4277/// \headerfile <x86intrin.h>
4278///
4279/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4280///
4281/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4282static __inline __m256d __DEFAULT_FN_ATTRS
4283_mm256_setzero_pd(void)
4284{
4285  return (__m256d){ 0, 0, 0, 0 };
4286}
4287
4288/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
4289///    vector elements initialized to zero.
4290///
4291/// \headerfile <x86intrin.h>
4292///
4293/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4294///
4295/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4296static __inline __m256 __DEFAULT_FN_ATTRS
4297_mm256_setzero_ps(void)
4298{
4299  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
4300}
4301
4302/// \brief Constructs a 256-bit integer vector initialized to zero.
4303///
4304/// \headerfile <x86intrin.h>
4305///
4306/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4307///
4308/// \returns A 256-bit integer vector initialized to zero.
4309static __inline __m256i __DEFAULT_FN_ATTRS
4310_mm256_setzero_si256(void)
4311{
4312  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
4313}
4314
4315/* Cast between vector types */
4316/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4317///    floating-point vector of [8 x float].
4318///
4319/// \headerfile <x86intrin.h>
4320///
4321/// This intrinsic has no corresponding instruction.
4322///
4323/// \param __a
4324///    A 256-bit floating-point vector of [4 x double].
4325/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4326///    bitwise pattern as the parameter.
4327static __inline __m256 __DEFAULT_FN_ATTRS
4328_mm256_castpd_ps(__m256d __a)
4329{
4330  return (__m256)__a;
4331}
4332
4333/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4334///    integer vector.
4335///
4336/// \headerfile <x86intrin.h>
4337///
4338/// This intrinsic has no corresponding instruction.
4339///
4340/// \param __a
4341///    A 256-bit floating-point vector of [4 x double].
4342/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4343///    parameter.
4344static __inline __m256i __DEFAULT_FN_ATTRS
4345_mm256_castpd_si256(__m256d __a)
4346{
4347  return (__m256i)__a;
4348}
4349
4350/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4351///    floating-point vector of [4 x double].
4352///
4353/// \headerfile <x86intrin.h>
4354///
4355/// This intrinsic has no corresponding instruction.
4356///
4357/// \param __a
4358///    A 256-bit floating-point vector of [8 x float].
4359/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4360///    bitwise pattern as the parameter.
4361static __inline __m256d __DEFAULT_FN_ATTRS
4362_mm256_castps_pd(__m256 __a)
4363{
4364  return (__m256d)__a;
4365}
4366
4367/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4368///    integer vector.
4369///
4370/// \headerfile <x86intrin.h>
4371///
4372/// This intrinsic has no corresponding instruction.
4373///
4374/// \param __a
4375///    A 256-bit floating-point vector of [8 x float].
4376/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4377///    parameter.
4378static __inline __m256i __DEFAULT_FN_ATTRS
4379_mm256_castps_si256(__m256 __a)
4380{
4381  return (__m256i)__a;
4382}
4383
4384/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4385///    of [8 x float].
4386///
4387/// \headerfile <x86intrin.h>
4388///
4389/// This intrinsic has no corresponding instruction.
4390///
4391/// \param __a
4392///    A 256-bit integer vector.
4393/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4394///    bitwise pattern as the parameter.
4395static __inline __m256 __DEFAULT_FN_ATTRS
4396_mm256_castsi256_ps(__m256i __a)
4397{
4398  return (__m256)__a;
4399}
4400
4401/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4402///    of [4 x double].
4403///
4404/// \headerfile <x86intrin.h>
4405///
4406/// This intrinsic has no corresponding instruction.
4407///
4408/// \param __a
4409///    A 256-bit integer vector.
4410/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4411///    bitwise pattern as the parameter.
4412static __inline __m256d __DEFAULT_FN_ATTRS
4413_mm256_castsi256_pd(__m256i __a)
4414{
4415  return (__m256d)__a;
4416}
4417
4418/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4419///    [4 x double] as a 128-bit floating-point vector of [2 x double].
4420///
4421/// \headerfile <x86intrin.h>
4422///
4423/// This intrinsic has no corresponding instruction.
4424///
4425/// \param __a
4426///    A 256-bit floating-point vector of [4 x double].
4427/// \returns A 128-bit floating-point vector of [2 x double] containing the
4428///    lower 128 bits of the parameter.
4429static __inline __m128d __DEFAULT_FN_ATTRS
4430_mm256_castpd256_pd128(__m256d __a)
4431{
4432  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4433}
4434
4435/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4436///    [8 x float] as a 128-bit floating-point vector of [4 x float].
4437///
4438/// \headerfile <x86intrin.h>
4439///
4440/// This intrinsic has no corresponding instruction.
4441///
4442/// \param __a
4443///    A 256-bit floating-point vector of [8 x float].
4444/// \returns A 128-bit floating-point vector of [4 x float] containing the
4445///    lower 128 bits of the parameter.
4446static __inline __m128 __DEFAULT_FN_ATTRS
4447_mm256_castps256_ps128(__m256 __a)
4448{
4449  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4450}
4451
4452/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
4453///
4454/// \headerfile <x86intrin.h>
4455///
4456/// This intrinsic has no corresponding instruction.
4457///
4458/// \param __a
4459///    A 256-bit integer vector.
4460/// \returns A 128-bit integer vector containing the lower 128 bits of the
4461///    parameter.
4462static __inline __m128i __DEFAULT_FN_ATTRS
4463_mm256_castsi256_si128(__m256i __a)
4464{
4465  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4466}
4467
4468/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
4469///    128-bit floating-point vector of [2 x double]. The lower 128 bits
4470///    contain the value of the source vector. The contents of the upper 128
4471///    bits are undefined.
4472///
4473/// \headerfile <x86intrin.h>
4474///
4475/// This intrinsic has no corresponding instruction.
4476///
4477/// \param __a
4478///    A 128-bit vector of [2 x double].
4479/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4480///    contain the value of the parameter. The contents of the upper 128 bits
4481///    are undefined.
4482static __inline __m256d __DEFAULT_FN_ATTRS
4483_mm256_castpd128_pd256(__m128d __a)
4484{
4485  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
4486}
4487
4488/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
4489///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4490///    the value of the source vector. The contents of the upper 128 bits are
4491///    undefined.
4492///
4493/// \headerfile <x86intrin.h>
4494///
4495/// This intrinsic has no corresponding instruction.
4496///
4497/// \param __a
4498///    A 128-bit vector of [4 x float].
4499/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4500///    contain the value of the parameter. The contents of the upper 128 bits
4501///    are undefined.
4502static __inline __m256 __DEFAULT_FN_ATTRS
4503_mm256_castps128_ps256(__m128 __a)
4504{
4505  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
4506}
4507
4508/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
4509///    The lower 128 bits contain the value of the source vector. The contents
4510///    of the upper 128 bits are undefined.
4511///
4512/// \headerfile <x86intrin.h>
4513///
4514/// This intrinsic has no corresponding instruction.
4515///
4516/// \param __a
4517///    A 128-bit integer vector.
4518/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4519///    the parameter. The contents of the upper 128 bits are undefined.
4520static __inline __m256i __DEFAULT_FN_ATTRS
4521_mm256_castsi128_si256(__m128i __a)
4522{
4523  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
4524}
4525
4526/*
4527   Vector insert.
4528   We use macros rather than inlines because we only want to accept
4529   invocations where the immediate M is a constant expression.
4530*/
/// \brief Builds a 256-bit vector of [8 x float] that is a copy of the first
///    parameter with one 128-bit half replaced by the 128-bit vector of
///    [4 x float] given in the second parameter. The immediate integer
///    parameter chooses which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. It supplies the 128-bit half of the
///    result that is not replaced by \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. Its contents are written to either the
///    upper or the lower 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is examined: \n
///    If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
///    [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
///    [255:128] of the result. \n
///    If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
///    [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
///    [127:0] of the result.
/// \returns A 256-bit vector of [8 x float] containing the combined values.
///
/// In the shuffle below, indices 0-7 select elements of \a V1 and indices
/// 8-11 select the four elements of \a V2 (after widening to 256 bits).
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    (((M) & 1) == 0 ?  8 :  0), \
    (((M) & 1) == 0 ?  9 :  1), \
    (((M) & 1) == 0 ? 10 :  2), \
    (((M) & 1) == 0 ? 11 :  3), \
    (((M) & 1) == 0 ?  4 :  8), \
    (((M) & 1) == 0 ?  5 :  9), \
    (((M) & 1) == 0 ?  6 : 10), \
    (((M) & 1) == 0 ?  7 : 11) );})
4575
/// \brief Builds a 256-bit vector of [4 x double] that is a copy of the first
///    parameter with one 128-bit half replaced by the 128-bit vector of
///    [2 x double] given in the second parameter. The immediate integer
///    parameter chooses which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It supplies the 128-bit half of the
///    result that is not replaced by \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. Its contents are written to either the
///    upper or the lower 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is examined: \n
///    If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
///    [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
///    [255:128] of the result. \n
///    If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
///    [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
///    [127:0] of the result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
///
/// In the shuffle below, indices 0-3 select elements of \a V1 and indices
/// 4-5 select the two elements of \a V2 (after widening to 256 bits).
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    (((M) & 1) == 0 ? 4 : 0), \
    (((M) & 1) == 0 ? 5 : 1), \
    (((M) & 1) == 0 ? 2 : 4), \
    (((M) & 1) == 0 ? 3 : 5) );})
4616
/// \brief Builds a 256-bit integer vector that is a copy of the first
///    parameter with one 128-bit half replaced by the 128-bit integer vector
///    given in the second parameter. The immediate integer parameter chooses
///    which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the 128-bit half of the result
///    that is not replaced by \a V2.
/// \param V2
///    A 128-bit integer vector. Its contents are written to either the upper
///    or the lower 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is examined: \n
///    If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
///    [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
///    [255:128] of the result. \n
///    If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
///    [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
///    [127:0] of the result.
/// \returns A 256-bit integer vector containing the combined values.
///
/// In the shuffle below, indices 0-3 select 64-bit elements of \a V1 and
/// indices 4-5 select the two 64-bit elements of \a V2 (after widening to
/// 256 bits).
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    (((M) & 1) == 0 ? 4 : 0), \
    (((M) & 1) == 0 ? 5 : 1), \
    (((M) & 1) == 0 ? 2 : 4), \
    (((M) & 1) == 0 ? 3 : 5) );})
4657
4658/*
4659   Vector extract.
4660   We use macros rather than inlines because we only want to accept
4661   invocations where the immediate M is a constant expression.
4662*/
/// \brief Selects one 128-bit half of a 256-bit vector of [8 x float], chosen
///    by the immediate integer parameter, and returns it as a 128-bit vector
///    of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. Only the least significant bit is examined: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
///
/// Every shuffle index below is in the range 0-7, so only elements of \a V
/// are selected; the second (undefined) operand is never read.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    (((M) & 1) == 0 ? 0 : 4), \
    (((M) & 1) == 0 ? 1 : 5), \
    (((M) & 1) == 0 ? 2 : 6), \
    (((M) & 1) == 0 ? 3 : 7) );})
4692
/// \brief Selects one 128-bit half of a 256-bit vector of [4 x double], chosen
///    by the immediate integer parameter, and returns it as a 128-bit vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. Only the least significant bit is examined: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
///
/// Every shuffle index below is in the range 0-3, so only elements of \a V
/// are selected; the second (undefined) operand is never read.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    (((M) & 1) == 0 ? 0 : 2), \
    (((M) & 1) == 0 ? 1 : 3) );})
4720
/// \brief Selects one 128-bit half of a 256-bit integer vector, chosen by the
///    immediate integer parameter, and returns it as a 128-bit integer
///    vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. Only the least significant bit is examined: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
///
/// Every shuffle index below is in the range 0-3, so only 64-bit elements of
/// \a V are selected; the second (undefined) operand is never read.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    (((M) & 1) == 0 ? 0 : 2), \
    (((M) & 1) == 0 ? 1 : 3) );})
4748
4749/* SIMD load ops (unaligned) */
4750/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
4751///    unaligned memory locations and constructs a 256-bit floating-point vector
4752///    of [8 x float] by concatenating the two 128-bit vectors.
4753///
4754/// \headerfile <x86intrin.h>
4755///
4756/// This intrinsic corresponds to load instructions followed by the
4757///   <c> VINSERTF128 </c> instruction.
4758///
4759/// \param __addr_hi
4760///    A pointer to a 128-bit memory location containing 4 consecutive
4761///    single-precision floating-point values. These values are to be copied to
4762///    bits[255:128] of the result. The address of the memory location does not
4763///    have to be aligned.
4764/// \param __addr_lo
4765///    A pointer to a 128-bit memory location containing 4 consecutive
4766///    single-precision floating-point values. These values are to be copied to
4767///    bits[127:0] of the result. The address of the memory location does not
4768///    have to be aligned.
4769/// \returns A 256-bit floating-point vector of [8 x float] containing the
4770///    concatenated result.
4771static __inline __m256 __DEFAULT_FN_ATTRS
4772_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4773{
4774  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4775  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
4776}
4777
4778/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
4779///    unaligned memory locations and constructs a 256-bit floating-point vector
4780///    of [4 x double] by concatenating the two 128-bit vectors.
4781///
4782/// \headerfile <x86intrin.h>
4783///
4784/// This intrinsic corresponds to load instructions followed by the
4785///   <c> VINSERTF128 </c> instruction.
4786///
4787/// \param __addr_hi
4788///    A pointer to a 128-bit memory location containing two consecutive
4789///    double-precision floating-point values. These values are to be copied to
4790///    bits[255:128] of the result. The address of the memory location does not
4791///    have to be aligned.
4792/// \param __addr_lo
4793///    A pointer to a 128-bit memory location containing two consecutive
4794///    double-precision floating-point values. These values are to be copied to
4795///    bits[127:0] of the result. The address of the memory location does not
4796///    have to be aligned.
4797/// \returns A 256-bit floating-point vector of [4 x double] containing the
4798///    concatenated result.
4799static __inline __m256d __DEFAULT_FN_ATTRS
4800_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4801{
4802  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4803  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
4804}
4805
4806/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
4807///    constructs a 256-bit integer vector by concatenating the two 128-bit
4808///    vectors.
4809///
4810/// \headerfile <x86intrin.h>
4811///
4812/// This intrinsic corresponds to load instructions followed by the
4813///   <c> VINSERTF128 </c> instruction.
4814///
4815/// \param __addr_hi
4816///    A pointer to a 128-bit memory location containing a 128-bit integer
4817///    vector. This vector is to be copied to bits[255:128] of the result. The
4818///    address of the memory location does not have to be aligned.
4819/// \param __addr_lo
4820///    A pointer to a 128-bit memory location containing a 128-bit integer
4821///    vector. This vector is to be copied to bits[127:0] of the result. The
4822///    address of the memory location does not have to be aligned.
4823/// \returns A 256-bit integer vector containing the concatenated result.
4824static __inline __m256i __DEFAULT_FN_ATTRS
4825_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
4826{
4827  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4828  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
4829}
4830
4831/* SIMD store ops (unaligned) */
4832/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4833///    vector of [8 x float] into two different unaligned memory locations.
4834///
4835/// \headerfile <x86intrin.h>
4836///
4837/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4838///   store instructions.
4839///
4840/// \param __addr_hi
4841///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4842///    copied to this memory location. The address of this memory location does
4843///    not have to be aligned.
4844/// \param __addr_lo
4845///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4846///    copied to this memory location. The address of this memory location does
4847///    not have to be aligned.
4848/// \param __a
4849///    A 256-bit floating-point vector of [8 x float].
4850static __inline void __DEFAULT_FN_ATTRS
4851_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
4852{
4853  __m128 __v128;
4854
4855  __v128 = _mm256_castps256_ps128(__a);
4856  _mm_storeu_ps(__addr_lo, __v128);
4857  __v128 = _mm256_extractf128_ps(__a, 1);
4858  _mm_storeu_ps(__addr_hi, __v128);
4859}
4860
4861/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4862///    vector of [4 x double] into two different unaligned memory locations.
4863///
4864/// \headerfile <x86intrin.h>
4865///
4866/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4867///   store instructions.
4868///
4869/// \param __addr_hi
4870///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4871///    copied to this memory location. The address of this memory location does
4872///    not have to be aligned.
4873/// \param __addr_lo
4874///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4875///    copied to this memory location. The address of this memory location does
4876///    not have to be aligned.
4877/// \param __a
4878///    A 256-bit floating-point vector of [4 x double].
4879static __inline void __DEFAULT_FN_ATTRS
4880_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
4881{
4882  __m128d __v128;
4883
4884  __v128 = _mm256_castpd256_pd128(__a);
4885  _mm_storeu_pd(__addr_lo, __v128);
4886  __v128 = _mm256_extractf128_pd(__a, 1);
4887  _mm_storeu_pd(__addr_hi, __v128);
4888}
4889
4890/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
4891///    two different unaligned memory locations.
4892///
4893/// \headerfile <x86intrin.h>
4894///
4895/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4896///   store instructions.
4897///
4898/// \param __addr_hi
4899///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4900///    copied to this memory location. The address of this memory location does
4901///    not have to be aligned.
4902/// \param __addr_lo
4903///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4904///    copied to this memory location. The address of this memory location does
4905///    not have to be aligned.
4906/// \param __a
4907///    A 256-bit integer vector.
4908static __inline void __DEFAULT_FN_ATTRS
4909_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
4910{
4911  __m128i __v128;
4912
4913  __v128 = _mm256_castsi256_si128(__a);
4914  _mm_storeu_si128(__addr_lo, __v128);
4915  __v128 = _mm256_extractf128_si256(__a, 1);
4916  _mm_storeu_si128(__addr_hi, __v128);
4917}
4918
4919/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
4920///    concatenating two 128-bit floating-point vectors of [4 x float].
4921///
4922/// \headerfile <x86intrin.h>
4923///
4924/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4925///
4926/// \param __hi
4927///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
4928///    128 bits of the result.
4929/// \param __lo
4930///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
4931///    128 bits of the result.
4932/// \returns A 256-bit floating-point vector of [8 x float] containing the
4933///    concatenated result.
4934static __inline __m256 __DEFAULT_FN_ATTRS
4935_mm256_set_m128 (__m128 __hi, __m128 __lo)
4936{
4937  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4938}
4939
4940/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
4941///    concatenating two 128-bit floating-point vectors of [2 x double].
4942///
4943/// \headerfile <x86intrin.h>
4944///
4945/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4946///
4947/// \param __hi
4948///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
4949///    128 bits of the result.
4950/// \param __lo
4951///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
4952///    128 bits of the result.
4953/// \returns A 256-bit floating-point vector of [4 x double] containing the
4954///    concatenated result.
4955static __inline __m256d __DEFAULT_FN_ATTRS
4956_mm256_set_m128d (__m128d __hi, __m128d __lo)
4957{
4958  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4959}
4960
4961/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
4962///    integer vectors.
4963///
4964/// \headerfile <x86intrin.h>
4965///
4966/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4967///
4968/// \param __hi
4969///    A 128-bit integer vector to be copied to the upper 128 bits of the
4970///    result.
4971/// \param __lo
4972///    A 128-bit integer vector to be copied to the lower 128 bits of the
4973///    result.
4974/// \returns A 256-bit integer vector containing the concatenated result.
4975static __inline __m256i __DEFAULT_FN_ATTRS
4976_mm256_set_m128i (__m128i __hi, __m128i __lo)
4977{
4978  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4979}
4980
4981/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
4982///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
4983///    similar to _mm256_set_m128, but the order of the input parameters is
4984///    swapped.
4985///
4986/// \headerfile <x86intrin.h>
4987///
4988/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4989///
4990/// \param __lo
4991///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
4992///    128 bits of the result.
4993/// \param __hi
4994///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
4995///    128 bits of the result.
4996/// \returns A 256-bit floating-point vector of [8 x float] containing the
4997///    concatenated result.
4998static __inline __m256 __DEFAULT_FN_ATTRS
4999_mm256_setr_m128 (__m128 __lo, __m128 __hi)
5000{
5001  return _mm256_set_m128(__hi, __lo);
5002}
5003
5004/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
5005///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
5006///    similar to _mm256_set_m128d, but the order of the input parameters is
5007///    swapped.
5008///
5009/// \headerfile <x86intrin.h>
5010///
5011/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5012///
5013/// \param __lo
5014///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
5015///    128 bits of the result.
5016/// \param __hi
5017///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
5018///    128 bits of the result.
5019/// \returns A 256-bit floating-point vector of [4 x double] containing the
5020///    concatenated result.
5021static __inline __m256d __DEFAULT_FN_ATTRS
5022_mm256_setr_m128d (__m128d __lo, __m128d __hi)
5023{
5024  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5025}
5026
5027/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
5028///    integer vectors. This is similar to _mm256_set_m128i, but the order of
5029///    the input parameters is swapped.
5030///
5031/// \headerfile <x86intrin.h>
5032///
5033/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5034///
5035/// \param __lo
5036///    A 128-bit integer vector to be copied to the lower 128 bits of the
5037///    result.
5038/// \param __hi
5039///    A 128-bit integer vector to be copied to the upper 128 bits of the
5040///    result.
5041/// \returns A 256-bit integer vector containing the concatenated result.
5042static __inline __m256i __DEFAULT_FN_ATTRS
5043_mm256_setr_m128i (__m128i __lo, __m128i __hi)
5044{
5045  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5046}
5047
5048#undef __DEFAULT_FN_ATTRS
5049
5050#endif /* __AVXINTRIN_H */
5051