1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#include <xmmintrin.h>
28
/* Public 128-bit vector types exposed by the SSE2 intrinsics. */
typedef double    __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal element-typed views of a 128-bit vector (signed / plain). */
typedef double    __v2df  __attribute__((__vector_size__(16)));
typedef long long __v2di  __attribute__((__vector_size__(16)));
typedef short     __v8hi  __attribute__((__vector_size__(16)));
typedef char      __v16qi __attribute__((__vector_size__(16)));

/* Internal element-typed views of a 128-bit vector (unsigned). */
typedef unsigned long long __v2du  __attribute__((__vector_size__(16)));
typedef unsigned short     __v8hu  __attribute__((__vector_size__(16)));
typedef unsigned char      __v16qu __attribute__((__vector_size__(16)));

/* An explicitly signed char vector is needed as well. It is an internal
 * helper only and should not appear in the interface. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
46
47#include <f16cintrin.h>
48
/* Default attribute set applied to every function defined in this file:
 * always inlined, no debug info, and compiled for the "sse2" target. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
51
52/// \brief Adds lower double-precision values in both operands and returns the
53///    sum in the lower 64 bits of the result. The upper 64 bits of the result
54///    are copied from the upper double-precision value of the first operand.
55///
56/// \headerfile <x86intrin.h>
57///
58/// This intrinsic corresponds to the \c VADDSD / ADDSD instruction.
59///
60/// \param __a
61///    A 128-bit vector of [2 x double] containing one of the source operands.
62/// \param __b
63///    A 128-bit vector of [2 x double] containing one of the source operands.
64/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
65///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
66///    from the upper 64 bits of the first source operand.
67static __inline__ __m128d __DEFAULT_FN_ATTRS
68_mm_add_sd(__m128d __a, __m128d __b)
69{
70  __a[0] += __b[0];
71  return __a;
72}
73
74/// \brief Adds two 128-bit vectors of [2 x double].
75///
76/// \headerfile <x86intrin.h>
77///
78/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
79///
80/// \param __a
81///    A 128-bit vector of [2 x double] containing one of the source operands.
82/// \param __b
83///    A 128-bit vector of [2 x double] containing one of the source operands.
84/// \returns A 128-bit vector of [2 x double] containing the sums of both
85///    operands.
86static __inline__ __m128d __DEFAULT_FN_ATTRS
87_mm_add_pd(__m128d __a, __m128d __b)
88{
89  return (__m128d)((__v2df)__a + (__v2df)__b);
90}
91
92/// \brief Subtracts the lower double-precision value of the second operand
93///    from the lower double-precision value of the first operand and returns
94///    the difference in the lower 64 bits of the result. The upper 64 bits of
95///    the result are copied from the upper double-precision value of the first
96///    operand.
97///
98/// \headerfile <x86intrin.h>
99///
100/// This intrinsic corresponds to the \c VSUBSD / SUBSD instruction.
101///
102/// \param __a
103///    A 128-bit vector of [2 x double] containing the minuend.
104/// \param __b
105///    A 128-bit vector of [2 x double] containing the subtrahend.
106/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
107///    difference of the lower 64 bits of both operands. The upper 64 bits are
108///    copied from the upper 64 bits of the first source operand.
109static __inline__ __m128d __DEFAULT_FN_ATTRS
110_mm_sub_sd(__m128d __a, __m128d __b)
111{
112  __a[0] -= __b[0];
113  return __a;
114}
115
116/// \brief Subtracts two 128-bit vectors of [2 x double].
117///
118/// \headerfile <x86intrin.h>
119///
120/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
121///
122/// \param __a
123///    A 128-bit vector of [2 x double] containing the minuend.
124/// \param __b
125///    A 128-bit vector of [2 x double] containing the subtrahend.
126/// \returns A 128-bit vector of [2 x double] containing the differences between
127///    both operands.
128static __inline__ __m128d __DEFAULT_FN_ATTRS
129_mm_sub_pd(__m128d __a, __m128d __b)
130{
131  return (__m128d)((__v2df)__a - (__v2df)__b);
132}
133
134/// \brief Multiplies lower double-precision values in both operands and returns
135///    the product in the lower 64 bits of the result. The upper 64 bits of the
136///    result are copied from the upper double-precision value of the first
137///    operand.
138///
139/// \headerfile <x86intrin.h>
140///
141/// This intrinsic corresponds to the \c VMULSD / MULSD instruction.
142///
143/// \param __a
144///    A 128-bit vector of [2 x double] containing one of the source operands.
145/// \param __b
146///    A 128-bit vector of [2 x double] containing one of the source operands.
147/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
148///    product of the lower 64 bits of both operands. The upper 64 bits are
149///    copied from the upper 64 bits of the first source operand.
150static __inline__ __m128d __DEFAULT_FN_ATTRS
151_mm_mul_sd(__m128d __a, __m128d __b)
152{
153  __a[0] *= __b[0];
154  return __a;
155}
156
157/// \brief Multiplies two 128-bit vectors of [2 x double].
158///
159/// \headerfile <x86intrin.h>
160///
161/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
162///
163/// \param __a
164///    A 128-bit vector of [2 x double] containing one of the operands.
165/// \param __b
166///    A 128-bit vector of [2 x double] containing one of the operands.
167/// \returns A 128-bit vector of [2 x double] containing the products of both
168///    operands.
169static __inline__ __m128d __DEFAULT_FN_ATTRS
170_mm_mul_pd(__m128d __a, __m128d __b)
171{
172  return (__m128d)((__v2df)__a * (__v2df)__b);
173}
174
175/// \brief Divides the lower double-precision value of the first operand by the
176///    lower double-precision value of the second operand and returns the
177///    quotient in the lower 64 bits of the result. The upper 64 bits of the
178///    result are copied from the upper double-precision value of the first
179///    operand.
180///
181/// \headerfile <x86intrin.h>
182///
183/// This intrinsic corresponds to the \c VDIVSD / DIVSD instruction.
184///
185/// \param __a
186///    A 128-bit vector of [2 x double] containing the dividend.
187/// \param __b
188///    A 128-bit vector of [2 x double] containing divisor.
189/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
190///    quotient of the lower 64 bits of both operands. The upper 64 bits are
191///    copied from the upper 64 bits of the first source operand.
192static __inline__ __m128d __DEFAULT_FN_ATTRS
193_mm_div_sd(__m128d __a, __m128d __b)
194{
195  __a[0] /= __b[0];
196  return __a;
197}
198
199/// \brief Performs an element-by-element division of two 128-bit vectors of
200///    [2 x double].
201///
202/// \headerfile <x86intrin.h>
203///
204/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
205///
206/// \param __a
207///    A 128-bit vector of [2 x double] containing the dividend.
208/// \param __b
209///    A 128-bit vector of [2 x double] containing the divisor.
210/// \returns A 128-bit vector of [2 x double] containing the quotients of both
211///    operands.
212static __inline__ __m128d __DEFAULT_FN_ATTRS
213_mm_div_pd(__m128d __a, __m128d __b)
214{
215  return (__m128d)((__v2df)__a / (__v2df)__b);
216}
217
218/// \brief Calculates the square root of the lower double-precision value of
219///    the second operand and returns it in the lower 64 bits of the result.
220///    The upper 64 bits of the result are copied from the upper double-
221///    precision value of the first operand.
222///
223/// \headerfile <x86intrin.h>
224///
225/// This intrinsic corresponds to the \c VSQRTSD / SQRTSD instruction.
226///
227/// \param __a
228///    A 128-bit vector of [2 x double] containing one of the operands. The
229///    upper 64 bits of this operand are copied to the upper 64 bits of the
230///    result.
231/// \param __b
232///    A 128-bit vector of [2 x double] containing one of the operands. The
233///    square root is calculated using the lower 64 bits of this operand.
234/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
235///    square root of the lower 64 bits of operand __b, and whose upper 64 bits
236///    are copied from the upper 64 bits of operand __a.
237static __inline__ __m128d __DEFAULT_FN_ATTRS
238_mm_sqrt_sd(__m128d __a, __m128d __b)
239{
240  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
241  return (__m128d) { __c[0], __a[1] };
242}
243
244/// \brief Calculates the square root of the each of two values stored in a
245///    128-bit vector of [2 x double].
246///
247/// \headerfile <x86intrin.h>
248///
249/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
250///
251/// \param __a
252///    A 128-bit vector of [2 x double].
253/// \returns A 128-bit vector of [2 x double] containing the square roots of the
254///    values in the operand.
255static __inline__ __m128d __DEFAULT_FN_ATTRS
256_mm_sqrt_pd(__m128d __a)
257{
258  return __builtin_ia32_sqrtpd((__v2df)__a);
259}
260
261/// \brief Compares lower 64-bit double-precision values of both operands, and
262///    returns the lesser of the pair of values in the lower 64-bits of the
263///    result. The upper 64 bits of the result are copied from the upper double-
264///    precision value of the first operand.
265///
266/// \headerfile <x86intrin.h>
267///
268/// This intrinsic corresponds to the \c VMINSD / MINSD instruction.
269///
270/// \param __a
271///    A 128-bit vector of [2 x double] containing one of the operands. The
272///    lower 64 bits of this operand are used in the comparison.
273/// \param __b
274///    A 128-bit vector of [2 x double] containing one of the operands. The
275///    lower 64 bits of this operand are used in the comparison.
276/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
277///    minimum value between both operands. The upper 64 bits are copied from
278///    the upper 64 bits of the first source operand.
279static __inline__ __m128d __DEFAULT_FN_ATTRS
280_mm_min_sd(__m128d __a, __m128d __b)
281{
282  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
283}
284
285/// \brief Performs element-by-element comparison of the two 128-bit vectors of
286///    [2 x double] and returns the vector containing the lesser of each pair of
287///    values.
288///
289/// \headerfile <x86intrin.h>
290///
291/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
292///
293/// \param __a
294///    A 128-bit vector of [2 x double] containing one of the operands.
295/// \param __b
296///    A 128-bit vector of [2 x double] containing one of the operands.
297/// \returns A 128-bit vector of [2 x double] containing the minimum values
298///    between both operands.
299static __inline__ __m128d __DEFAULT_FN_ATTRS
300_mm_min_pd(__m128d __a, __m128d __b)
301{
302  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
303}
304
305/// \brief Compares lower 64-bits double-precision values of both operands, and
306///    returns the greater of the pair of values in the lower 64-bits of the
307///    result. The upper 64 bits of the result are copied from the upper double-
308///    precision value of the first operand.
309///
310/// \headerfile <x86intrin.h>
311///
312/// This intrinsic corresponds to the \c VMAXSD / MAXSD instruction.
313///
314/// \param __a
315///    A 128-bit vector of [2 x double] containing one of the operands. The
316///    lower 64 bits of this operand are used in the comparison.
317/// \param __b
318///    A 128-bit vector of [2 x double] containing one of the operands. The
319///    lower 64 bits of this operand are used in the comparison.
320/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
321///    maximum value between both operands. The upper 64 bits are copied from
322///    the upper 64 bits of the first source operand.
323static __inline__ __m128d __DEFAULT_FN_ATTRS
324_mm_max_sd(__m128d __a, __m128d __b)
325{
326  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
327}
328
329/// \brief Performs element-by-element comparison of the two 128-bit vectors of
330///    [2 x double] and returns the vector containing the greater of each pair
331///    of values.
332///
333/// \headerfile <x86intrin.h>
334///
335/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
336///
337/// \param __a
338///    A 128-bit vector of [2 x double] containing one of the operands.
339/// \param __b
340///    A 128-bit vector of [2 x double] containing one of the operands.
341/// \returns A 128-bit vector of [2 x double] containing the maximum values
342///    between both operands.
343static __inline__ __m128d __DEFAULT_FN_ATTRS
344_mm_max_pd(__m128d __a, __m128d __b)
345{
346  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
347}
348
349/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double].
350///
351/// \headerfile <x86intrin.h>
352///
353/// This intrinsic corresponds to the \c VPAND / PAND instruction.
354///
355/// \param __a
356///    A 128-bit vector of [2 x double] containing one of the source operands.
357/// \param __b
358///    A 128-bit vector of [2 x double] containing one of the source operands.
359/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
360///    values between both operands.
361static __inline__ __m128d __DEFAULT_FN_ATTRS
362_mm_and_pd(__m128d __a, __m128d __b)
363{
364  return (__m128d)((__v2du)__a & (__v2du)__b);
365}
366
367/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using
368///    the one's complement of the values contained in the first source operand.
369///
370/// \headerfile <x86intrin.h>
371///
372/// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
373///
374/// \param __a
375///    A 128-bit vector of [2 x double] containing the left source operand. The
376///    one's complement of this value is used in the bitwise AND.
377/// \param __b
378///    A 128-bit vector of [2 x double] containing the right source operand.
379/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
380///    values in the second operand and the one's complement of the first
381///    operand.
382static __inline__ __m128d __DEFAULT_FN_ATTRS
383_mm_andnot_pd(__m128d __a, __m128d __b)
384{
385  return (__m128d)(~(__v2du)__a & (__v2du)__b);
386}
387
388/// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double].
389///
390/// \headerfile <x86intrin.h>
391///
392/// This intrinsic corresponds to the \c VPOR / POR instruction.
393///
394/// \param __a
395///    A 128-bit vector of [2 x double] containing one of the source operands.
396/// \param __b
397///    A 128-bit vector of [2 x double] containing one of the source operands.
398/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
399///    values between both operands.
400static __inline__ __m128d __DEFAULT_FN_ATTRS
401_mm_or_pd(__m128d __a, __m128d __b)
402{
403  return (__m128d)((__v2du)__a | (__v2du)__b);
404}
405
406/// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double].
407///
408/// \headerfile <x86intrin.h>
409///
410/// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
411///
412/// \param __a
413///    A 128-bit vector of [2 x double] containing one of the source operands.
414/// \param __b
415///    A 128-bit vector of [2 x double] containing one of the source operands.
416/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
417///    values between both operands.
418static __inline__ __m128d __DEFAULT_FN_ATTRS
419_mm_xor_pd(__m128d __a, __m128d __b)
420{
421  return (__m128d)((__v2du)__a ^ (__v2du)__b);
422}
423
424/// \brief Compares each of the corresponding double-precision values of the
425///    128-bit vectors of [2 x double] for equality. Each comparison yields 0h
426///    for false, FFFFFFFFFFFFFFFFh for true.
427///
428/// \headerfile <x86intrin.h>
429///
430/// This intrinsic corresponds to the \c VCMPEQPD / CMPEQPD instruction.
431///
432/// \param __a
433///    A 128-bit vector of [2 x double].
434/// \param __b
435///    A 128-bit vector of [2 x double].
436/// \returns A 128-bit vector containing the comparison results.
437static __inline__ __m128d __DEFAULT_FN_ATTRS
438_mm_cmpeq_pd(__m128d __a, __m128d __b)
439{
440  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
441}
442
443/// \brief Compares each of the corresponding double-precision values of the
444///    128-bit vectors of [2 x double] to determine if the values in the first
445///    operand are less than those in the second operand. Each comparison
446///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
447///
448/// \headerfile <x86intrin.h>
449///
450/// This intrinsic corresponds to the \c VCMPLTPD / CMPLTPD instruction.
451///
452/// \param __a
453///    A 128-bit vector of [2 x double].
454/// \param __b
455///    A 128-bit vector of [2 x double].
456/// \returns A 128-bit vector containing the comparison results.
457static __inline__ __m128d __DEFAULT_FN_ATTRS
458_mm_cmplt_pd(__m128d __a, __m128d __b)
459{
460  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
461}
462
463/// \brief Compares each of the corresponding double-precision values of the
464///    128-bit vectors of [2 x double] to determine if the values in the first
465///    operand are less than or equal to those in the second operand. Each
466///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
467///
468/// \headerfile <x86intrin.h>
469///
470/// This intrinsic corresponds to the \c VCMPLEPD / CMPLEPD instruction.
471///
472/// \param __a
473///    A 128-bit vector of [2 x double].
474/// \param __b
475///    A 128-bit vector of [2 x double].
476/// \returns A 128-bit vector containing the comparison results.
477static __inline__ __m128d __DEFAULT_FN_ATTRS
478_mm_cmple_pd(__m128d __a, __m128d __b)
479{
480  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
481}
482
483/// \brief Compares each of the corresponding double-precision values of the
484///    128-bit vectors of [2 x double] to determine if the values in the first
485///    operand are greater than those in the second operand. Each comparison
486///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
487///
488/// \headerfile <x86intrin.h>
489///
490/// This intrinsic corresponds to the \c VCMPLTPD / CMPLTPD instruction.
491///
492/// \param __a
493///    A 128-bit vector of [2 x double].
494/// \param __b
495///    A 128-bit vector of [2 x double].
496/// \returns A 128-bit vector containing the comparison results.
497static __inline__ __m128d __DEFAULT_FN_ATTRS
498_mm_cmpgt_pd(__m128d __a, __m128d __b)
499{
500  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
501}
502
503/// \brief Compares each of the corresponding double-precision values of the
504///    128-bit vectors of [2 x double] to determine if the values in the first
505///    operand are greater than or equal to those in the second operand. Each
506///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
507///
508/// \headerfile <x86intrin.h>
509///
510/// This intrinsic corresponds to the \c VCMPLEPD / CMPLEPD instruction.
511///
512/// \param __a
513///    A 128-bit vector of [2 x double].
514/// \param __b
515///    A 128-bit vector of [2 x double].
516/// \returns A 128-bit vector containing the comparison results.
517static __inline__ __m128d __DEFAULT_FN_ATTRS
518_mm_cmpge_pd(__m128d __a, __m128d __b)
519{
520  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
521}
522
523/// \brief Compares each of the corresponding double-precision values of the
524///    128-bit vectors of [2 x double] to determine if the values in the first
525///    operand are ordered with respect to those in the second operand. A pair
526///    of double-precision values are "ordered" with respect to each other if
527///    neither value is a NaN. Each comparison yields 0h for false,
528///    FFFFFFFFFFFFFFFFh for true.
529///
530/// \headerfile <x86intrin.h>
531///
532/// This intrinsic corresponds to the \c VCMPORDPD / CMPORDPD instruction.
533///
534/// \param __a
535///    A 128-bit vector of [2 x double].
536/// \param __b
537///    A 128-bit vector of [2 x double].
538/// \returns A 128-bit vector containing the comparison results.
539static __inline__ __m128d __DEFAULT_FN_ATTRS
540_mm_cmpord_pd(__m128d __a, __m128d __b)
541{
542  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
543}
544
545/// \brief Compares each of the corresponding double-precision values of the
546///    128-bit vectors of [2 x double] to determine if the values in the first
547///    operand are unordered with respect to those in the second operand. A pair
548///    of double-precision values are "unordered" with respect to each other if
549///    one or both values are NaN. Each comparison yields 0h for false,
550///    FFFFFFFFFFFFFFFFh for true.
551///
552/// \headerfile <x86intrin.h>
553///
554/// This intrinsic corresponds to the \c VCMPUNORDPD / CMPUNORDPD instruction.
555///
556/// \param __a
557///    A 128-bit vector of [2 x double].
558/// \param __b
559///    A 128-bit vector of [2 x double].
560/// \returns A 128-bit vector containing the comparison results.
561static __inline__ __m128d __DEFAULT_FN_ATTRS
562_mm_cmpunord_pd(__m128d __a, __m128d __b)
563{
564  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
565}
566
567/// \brief Compares each of the corresponding double-precision values of the
568///    128-bit vectors of [2 x double] to determine if the values in the first
569///    operand are unequal to those in the second operand. Each comparison
570///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
571///
572/// \headerfile <x86intrin.h>
573///
574/// This intrinsic corresponds to the \c VCMPNEQPD / CMPNEQPD instruction.
575///
576/// \param __a
577///    A 128-bit vector of [2 x double].
578/// \param __b
579///    A 128-bit vector of [2 x double].
580/// \returns A 128-bit vector containing the comparison results.
581static __inline__ __m128d __DEFAULT_FN_ATTRS
582_mm_cmpneq_pd(__m128d __a, __m128d __b)
583{
584  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
585}
586
587/// \brief Compares each of the corresponding double-precision values of the
588///    128-bit vectors of [2 x double] to determine if the values in the first
589///    operand are not less than those in the second operand. Each comparison
590///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
591///
592/// \headerfile <x86intrin.h>
593///
594/// This intrinsic corresponds to the \c VCMPNLTPD / CMPNLTPD instruction.
595///
596/// \param __a
597///    A 128-bit vector of [2 x double].
598/// \param __b
599///    A 128-bit vector of [2 x double].
600/// \returns A 128-bit vector containing the comparison results.
601static __inline__ __m128d __DEFAULT_FN_ATTRS
602_mm_cmpnlt_pd(__m128d __a, __m128d __b)
603{
604  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
605}
606
607/// \brief Compares each of the corresponding double-precision values of the
608///    128-bit vectors of [2 x double] to determine if the values in the first
609///    operand are not less than or equal to those in the second operand. Each
610///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
611///
612/// \headerfile <x86intrin.h>
613///
614/// This intrinsic corresponds to the \c VCMPNLEPD / CMPNLEPD instruction.
615///
616/// \param __a
617///    A 128-bit vector of [2 x double].
618/// \param __b
619///    A 128-bit vector of [2 x double].
620/// \returns A 128-bit vector containing the comparison results.
621static __inline__ __m128d __DEFAULT_FN_ATTRS
622_mm_cmpnle_pd(__m128d __a, __m128d __b)
623{
624  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
625}
626
627/// \brief Compares each of the corresponding double-precision values of the
628///    128-bit vectors of [2 x double] to determine if the values in the first
629///    operand are not greater than those in the second operand. Each
630///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
631///
632/// \headerfile <x86intrin.h>
633///
634/// This intrinsic corresponds to the \c VCMPNLTPD / CMPNLTPD instruction.
635///
636/// \param __a
637///    A 128-bit vector of [2 x double].
638/// \param __b
639///    A 128-bit vector of [2 x double].
640/// \returns A 128-bit vector containing the comparison results.
641static __inline__ __m128d __DEFAULT_FN_ATTRS
642_mm_cmpngt_pd(__m128d __a, __m128d __b)
643{
644  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
645}
646
647/// \brief Compares each of the corresponding double-precision values of the
648///    128-bit vectors of [2 x double] to determine if the values in the first
649///    operand are not greater than or equal to those in the second operand.
650///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
651///
652/// \headerfile <x86intrin.h>
653///
654/// This intrinsic corresponds to the \c VCMPNLEPD / CMPNLEPD instruction.
655///
656/// \param __a
657///    A 128-bit vector of [2 x double].
658/// \param __b
659///    A 128-bit vector of [2 x double].
660/// \returns A 128-bit vector containing the comparison results.
661static __inline__ __m128d __DEFAULT_FN_ATTRS
662_mm_cmpnge_pd(__m128d __a, __m128d __b)
663{
664  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
665}
666
667/// \brief Compares the lower double-precision floating-point values in each of
668///    the two 128-bit floating-point vectors of [2 x double] for equality. The
669///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
670///
671/// \headerfile <x86intrin.h>
672///
673/// This intrinsic corresponds to the \c VCMPEQSD / CMPEQSD instruction.
674///
675/// \param __a
676///    A 128-bit vector of [2 x double]. The lower double-precision value is
677///    compared to the lower double-precision value of __b.
678/// \param __b
679///    A 128-bit vector of [2 x double]. The lower double-precision value is
680///    compared to the lower double-precision value of __a.
681/// \returns A 128-bit vector. The lower 64 bits contains the comparison
682///    results. The upper 64 bits are copied from the upper 64 bits of __a.
683static __inline__ __m128d __DEFAULT_FN_ATTRS
684_mm_cmpeq_sd(__m128d __a, __m128d __b)
685{
686  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
687}
688
689/// \brief Compares the lower double-precision floating-point values in each of
690///    the two 128-bit floating-point vectors of [2 x double] to determine if
691///    the value in the first parameter is less than the corresponding value in
692///    the second parameter. The comparison yields 0h for false,
693///    FFFFFFFFFFFFFFFFh for true.
694///
695/// \headerfile <x86intrin.h>
696///
697/// This intrinsic corresponds to the \c VCMPLTSD / CMPLTSD instruction.
698///
699/// \param __a
700///    A 128-bit vector of [2 x double]. The lower double-precision value is
701///    compared to the lower double-precision value of __b.
702/// \param __b
703///    A 128-bit vector of [2 x double]. The lower double-precision value is
704///    compared to the lower double-precision value of __a.
705/// \returns A 128-bit vector. The lower 64 bits contains the comparison
706///    results. The upper 64 bits are copied from the upper 64 bits of __a.
707static __inline__ __m128d __DEFAULT_FN_ATTRS
708_mm_cmplt_sd(__m128d __a, __m128d __b)
709{
710  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
711}
712
713/// \brief Compares the lower double-precision floating-point values in each of
714///    the two 128-bit floating-point vectors of [2 x double] to determine if
715///    the value in the first parameter is less than or equal to the
716///    corresponding value in the second parameter. The comparison yields 0h for
717///    false, FFFFFFFFFFFFFFFFh for true.
718///
719/// \headerfile <x86intrin.h>
720///
721/// This intrinsic corresponds to the \c VCMPLESD / CMPLESD instruction.
722///
723/// \param __a
724///    A 128-bit vector of [2 x double]. The lower double-precision value is
725///    compared to the lower double-precision value of __b.
726/// \param __b
727///    A 128-bit vector of [2 x double]. The lower double-precision value is
728///    compared to the lower double-precision value of __a.
729/// \returns A 128-bit vector. The lower 64 bits contains the comparison
730///    results. The upper 64 bits are copied from the upper 64 bits of __a.
731static __inline__ __m128d __DEFAULT_FN_ATTRS
732_mm_cmple_sd(__m128d __a, __m128d __b)
733{
734  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
735}
736
737/// \brief  Compares the lower double-precision floating-point values in each
738///    of the two 128-bit floating-point vectors of [2 x double] to determine
739///    if the value in the first parameter is greater than the corresponding
740///    value in the second parameter. The comparison yields 0h for false,
741///    FFFFFFFFFFFFFFFFh for true.
742///
743/// \headerfile <x86intrin.h>
744///
745/// This intrinsic corresponds to the \c VCMPLTSD / CMPLTSD instruction.
746///
747/// \param __a
748///     A 128-bit vector of [2 x double]. The lower double-precision value is
749///     compared to the lower double-precision value of __b.
750/// \param __b
751///     A 128-bit vector of [2 x double]. The lower double-precision value is
752///     compared to the lower double-precision value of __a.
753/// \returns A 128-bit vector. The lower 64 bits contains the comparison
754///     results. The upper 64 bits are copied from the upper 64 bits of __a.
755static __inline__ __m128d __DEFAULT_FN_ATTRS
756_mm_cmpgt_sd(__m128d __a, __m128d __b)
757{
758  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
759  return (__m128d) { __c[0], __a[1] };
760}
761
762/// \brief Compares the lower double-precision floating-point values in each of
763///    the two 128-bit floating-point vectors of [2 x double] to determine if
764///    the value in the first parameter is greater than or equal to the
765///    corresponding value in the second parameter. The comparison yields 0h for
766///    false, FFFFFFFFFFFFFFFFh for true.
767///
768/// \headerfile <x86intrin.h>
769///
770/// This intrinsic corresponds to the \c VCMPLESD / CMPLESD instruction.
771///
772/// \param __a
773///    A 128-bit vector of [2 x double]. The lower double-precision value is
774///    compared to the lower double-precision value of __b.
775/// \param __b
776///    A 128-bit vector of [2 x double]. The lower double-precision value is
777///    compared to the lower double-precision value of __a.
778/// \returns A 128-bit vector. The lower 64 bits contains the comparison
779///    results. The upper 64 bits are copied from the upper 64 bits of __a.
780static __inline__ __m128d __DEFAULT_FN_ATTRS
781_mm_cmpge_sd(__m128d __a, __m128d __b)
782{
783  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
784  return (__m128d) { __c[0], __a[1] };
785}
786
787/// \brief  Compares the lower double-precision floating-point values in each
788///    of the two 128-bit floating-point vectors of [2 x double] to determine
789///    if the value in the first parameter is "ordered" with respect to the
790///    corresponding value in the second parameter. The comparison yields 0h for
791///    false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are
792///    "ordered" with respect to each other if neither value is a NaN.
793///
794/// \headerfile <x86intrin.h>
795///
796/// This intrinsic corresponds to the \c VCMPORDSD / CMPORDSD instruction.
797///
798/// \param __a
799///    A 128-bit vector of [2 x double]. The lower double-precision value is
800///    compared to the lower double-precision value of __b.
801/// \param __b
802///    A 128-bit vector of [2 x double]. The lower double-precision value is
803///    compared to the lower double-precision value of __a.
804/// \returns A 128-bit vector. The lower 64 bits contains the comparison
805///    results. The upper 64 bits are copied from the upper 64 bits of __a.
806static __inline__ __m128d __DEFAULT_FN_ATTRS
807_mm_cmpord_sd(__m128d __a, __m128d __b)
808{
809  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
810}
811
812/// \brief  Compares the lower double-precision floating-point values in each
813///    of the two 128-bit floating-point vectors of [2 x double] to determine
814///    if the value in the first parameter is "unordered" with respect to the
815///    corresponding value in the second parameter. The comparison yields 0h
816///    for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values
817///    are "unordered" with respect to each other if one or both values are NaN.
818///
819/// \headerfile <x86intrin.h>
820///
821/// This intrinsic corresponds to the \c VCMPUNORDSD / CMPUNORDSD instruction.
822///
823/// \param __a
824///    A 128-bit vector of [2 x double]. The lower double-precision value is
825///    compared to the lower double-precision value of __b.
826/// \param __b
827///    A 128-bit vector of [2 x double]. The lower double-precision value is
828///    compared to the lower double-precision value of __a.
829/// \returns A 128-bit vector. The lower 64 bits contains the comparison
830///    results. The upper 64 bits are copied from the upper 64 bits of __a.
831static __inline__ __m128d __DEFAULT_FN_ATTRS
832_mm_cmpunord_sd(__m128d __a, __m128d __b)
833{
834  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
835}
836
837/// \brief Compares the lower double-precision floating-point values in each of
838///    the two 128-bit floating-point vectors of [2 x double] to determine if
839///    the value in the first parameter is unequal to the corresponding value in
840///    the second parameter. The comparison yields 0h for false,
841///    FFFFFFFFFFFFFFFFh for true.
842///
843/// \headerfile <x86intrin.h>
844///
845/// This intrinsic corresponds to the \c VCMPNEQSD / CMPNEQSD instruction.
846///
847/// \param __a
848///    A 128-bit vector of [2 x double]. The lower double-precision value is
849///    compared to the lower double-precision value of __b.
850/// \param __b
851///    A 128-bit vector of [2 x double]. The lower double-precision value is
852///    compared to the lower double-precision value of __a.
853/// \returns A 128-bit vector. The lower 64 bits contains the comparison
854///    results. The upper 64 bits are copied from the upper 64 bits of __a.
855static __inline__ __m128d __DEFAULT_FN_ATTRS
856_mm_cmpneq_sd(__m128d __a, __m128d __b)
857{
858  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
859}
860
861/// \brief Compares the lower double-precision floating-point values in each of
862///    the two 128-bit floating-point vectors of [2 x double] to determine if
863///    the value in the first parameter is not less than the corresponding
864///    value in the second parameter. The comparison yields 0h for false,
865///    FFFFFFFFFFFFFFFFh for true.
866///
867/// \headerfile <x86intrin.h>
868///
869/// This intrinsic corresponds to the \c VCMPNLTSD / CMPNLTSD instruction.
870///
871/// \param __a
872///    A 128-bit vector of [2 x double]. The lower double-precision value is
873///    compared to the lower double-precision value of __b.
874/// \param __b
875///    A 128-bit vector of [2 x double]. The lower double-precision value is
876///    compared to the lower double-precision value of __a.
877/// \returns A 128-bit vector. The lower 64 bits contains the comparison
878///    results. The upper 64 bits are copied from the upper 64 bits of __a.
879static __inline__ __m128d __DEFAULT_FN_ATTRS
880_mm_cmpnlt_sd(__m128d __a, __m128d __b)
881{
882  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
883}
884
885/// \brief Compares the lower double-precision floating-point values in each of
886///    the two 128-bit floating-point vectors of [2 x double] to determine if
887///    the value in the first parameter is not less than or equal to the
888///    corresponding value in the second parameter. The comparison yields 0h
889///    for false, FFFFFFFFFFFFFFFFh for true.
890///
891/// \headerfile <x86intrin.h>
892///
893/// This intrinsic corresponds to the \c VCMPNLESD / CMPNLESD instruction.
894///
895/// \param __a
896///    A 128-bit vector of [2 x double]. The lower double-precision value is
897///    compared to the lower double-precision value of __b.
898/// \param __b
899///    A 128-bit vector of [2 x double]. The lower double-precision value is
900///    compared to the lower double-precision value of __a.
901/// \returns  A 128-bit vector. The lower 64 bits contains the comparison
902///    results. The upper 64 bits are copied from the upper 64 bits of __a.
903static __inline__ __m128d __DEFAULT_FN_ATTRS
904_mm_cmpnle_sd(__m128d __a, __m128d __b)
905{
906  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
907}
908
909/// \brief Compares the lower double-precision floating-point values in each of
910///    the two 128-bit floating-point vectors of [2 x double] to determine if
911///    the value in the first parameter is not greater than the corresponding
912///    value in the second parameter. The comparison yields 0h for false,
913///    FFFFFFFFFFFFFFFFh for true.
914///
915/// \headerfile <x86intrin.h>
916///
917/// This intrinsic corresponds to the \c VCMPNLTSD / CMPNLTSD instruction.
918///
919/// \param __a
920///    A 128-bit vector of [2 x double]. The lower double-precision value is
921///    compared to the lower double-precision value of __b.
922/// \param __b
923///    A 128-bit vector of [2 x double]. The lower double-precision value is
924///    compared to the lower double-precision value of __a.
925/// \returns A 128-bit vector. The lower 64 bits contains the comparison
926///    results. The upper 64 bits are copied from the upper 64 bits of __a.
927static __inline__ __m128d __DEFAULT_FN_ATTRS
928_mm_cmpngt_sd(__m128d __a, __m128d __b)
929{
930  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
931  return (__m128d) { __c[0], __a[1] };
932}
933
934/// \brief Compares the lower double-precision floating-point values in each of
935///    the two 128-bit floating-point vectors of [2 x double] to determine if
936///    the value in the first parameter is not greater than or equal to the
937///    corresponding value in the second parameter. The comparison yields 0h
938///    for false, FFFFFFFFFFFFFFFFh for true.
939///
940/// \headerfile <x86intrin.h>
941///
942/// This intrinsic corresponds to the \c VCMPNLESD / CMPNLESD instruction.
943///
944/// \param __a
945///    A 128-bit vector of [2 x double]. The lower double-precision value is
946///    compared to the lower double-precision value of __b.
947/// \param __b
948///    A 128-bit vector of [2 x double]. The lower double-precision value is
949///    compared to the lower double-precision value of __a.
950/// \returns A 128-bit vector. The lower 64 bits contains the comparison
951///    results. The upper 64 bits are copied from the upper 64 bits of __a.
952static __inline__ __m128d __DEFAULT_FN_ATTRS
953_mm_cmpnge_sd(__m128d __a, __m128d __b)
954{
955  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
956  return (__m128d) { __c[0], __a[1] };
957}
958
959/// \brief Compares the lower double-precision floating-point values in each of
960///    the two 128-bit floating-point vectors of [2 x double] for equality. The
961///    comparison yields 0 for false, 1 for true.
962///
963/// \headerfile <x86intrin.h>
964///
965/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction.
966///
967/// \param __a
968///    A 128-bit vector of [2 x double]. The lower double-precision value is
969///    compared to the lower double-precision value of __b.
970/// \param __b
971///    A 128-bit vector of [2 x double]. The lower double-precision value is
972///    compared to the lower double-precision value of __a.
973/// \returns An integer containing the comparison results.
974static __inline__ int __DEFAULT_FN_ATTRS
975_mm_comieq_sd(__m128d __a, __m128d __b)
976{
977  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
978}
979
980/// \brief Compares the lower double-precision floating-point values in each of
981///    the two 128-bit floating-point vectors of [2 x double] to determine if
982///    the value in the first parameter is less than the corresponding value in
983///    the second parameter. The comparison yields 0 for false, 1 for true.
984///
985/// \headerfile <x86intrin.h>
986///
987/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction.
988///
989/// \param __a
990///    A 128-bit vector of [2 x double]. The lower double-precision value is
991///    compared to the lower double-precision value of __b.
992/// \param __b
993///    A 128-bit vector of [2 x double]. The lower double-precision value is
994///    compared to the lower double-precision value of __a.
995/// \returns An integer containing the comparison results.
996static __inline__ int __DEFAULT_FN_ATTRS
997_mm_comilt_sd(__m128d __a, __m128d __b)
998{
999  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1000}
1001
1002/// \brief Compares the lower double-precision floating-point values in each of
1003///    the two 128-bit floating-point vectors of [2 x double] to determine if
1004///    the value in the first parameter is less than or equal to the
1005///    corresponding value in the second parameter. The comparison yields 0 for
1006///    false, 1 for true.
1007///
1008/// \headerfile <x86intrin.h>
1009///
1010/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction.
1011///
1012/// \param __a
1013///    A 128-bit vector of [2 x double]. The lower double-precision value is
1014///    compared to the lower double-precision value of __b.
1015/// \param __b
1016///     A 128-bit vector of [2 x double]. The lower double-precision value is
1017///     compared to the lower double-precision value of __a.
1018/// \returns An integer containing the comparison results.
1019static __inline__ int __DEFAULT_FN_ATTRS
1020_mm_comile_sd(__m128d __a, __m128d __b)
1021{
1022  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1023}
1024
1025/// \brief Compares the lower double-precision floating-point values in each of
1026///    the two 128-bit floating-point vectors of [2 x double] to determine if
1027///    the value in the first parameter is greater than the corresponding value
1028///    in the second parameter. The comparison yields 0 for false, 1 for true.
1029///
1030/// \headerfile <x86intrin.h>
1031///
1032/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction.
1033///
1034/// \param __a
1035///    A 128-bit vector of [2 x double]. The lower double-precision value is
1036///    compared to the lower double-precision value of __b.
1037/// \param __b
1038///    A 128-bit vector of [2 x double]. The lower double-precision value is
1039///    compared to the lower double-precision value of __a.
1040/// \returns An integer containing the comparison results.
1041static __inline__ int __DEFAULT_FN_ATTRS
1042_mm_comigt_sd(__m128d __a, __m128d __b)
1043{
1044  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1045}
1046
1047/// \brief Compares the lower double-precision floating-point values in each of
1048///    the two 128-bit floating-point vectors of [2 x double] to determine if
1049///    the value in the first parameter is greater than or equal to the
1050///    corresponding value in the second parameter. The comparison yields 0 for
1051///    false, 1 for true.
1052///
1053/// \headerfile <x86intrin.h>
1054///
1055/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction.
1056///
1057/// \param __a
1058///    A 128-bit vector of [2 x double]. The lower double-precision value is
1059///    compared to the lower double-precision value of __b.
1060/// \param __b
1061///    A 128-bit vector of [2 x double]. The lower double-precision value is
1062///    compared to the lower double-precision value of __a.
1063/// \returns An integer containing the comparison results.
1064static __inline__ int __DEFAULT_FN_ATTRS
1065_mm_comige_sd(__m128d __a, __m128d __b)
1066{
1067  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1068}
1069
1070/// \brief Compares the lower double-precision floating-point values in each of
1071///    the two 128-bit floating-point vectors of [2 x double] to determine if
1072///    the value in the first parameter is unequal to the corresponding value in
1073///    the second parameter. The comparison yields 0 for false, 1 for true.
1074///
1075/// \headerfile <x86intrin.h>
1076///
1077/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction.
1078///
1079/// \param __a
1080///    A 128-bit vector of [2 x double]. The lower double-precision value is
1081///    compared to the lower double-precision value of __b.
1082/// \param __b
1083///    A 128-bit vector of [2 x double]. The lower double-precision value is
1084///    compared to the lower double-precision value of __a.
1085/// \returns An integer containing the comparison results.
1086static __inline__ int __DEFAULT_FN_ATTRS
1087_mm_comineq_sd(__m128d __a, __m128d __b)
1088{
1089  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1090}
1091
1092/// \brief Compares the lower double-precision floating-point values in each of
1093///    the two 128-bit floating-point vectors of [2 x double] for equality. The
1094///    comparison yields 0 for false, 1 for true. If either of the two lower
1095///    double-precision values is NaN, 1 is returned.
1096///
1097/// \headerfile <x86intrin.h>
1098///
1099/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction.
1100///
1101/// \param __a
1102///    A 128-bit vector of [2 x double]. The lower double-precision value is
1103///    compared to the lower double-precision value of __b.
1104/// \param __b
1105///    A 128-bit vector of [2 x double]. The lower double-precision value is
1106///    compared to the lower double-precision value of __a.
1107/// \returns An integer containing the comparison results. If either of the two
1108///    lower double-precision values is NaN, 1 is returned.
1109static __inline__ int __DEFAULT_FN_ATTRS
1110_mm_ucomieq_sd(__m128d __a, __m128d __b)
1111{
1112  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1113}
1114
1115/// \brief Compares the lower double-precision floating-point values in each of
1116///    the two 128-bit floating-point vectors of [2 x double] to determine if
1117///    the value in the first parameter is less than the corresponding value in
1118///    the second parameter. The comparison yields 0 for false, 1 for true.
1119///    If either of the two lower double-precision values is NaN, 1 is returned.
1120///
1121/// \headerfile <x86intrin.h>
1122///
1123/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction.
1124///
1125/// \param __a
1126///    A 128-bit vector of [2 x double]. The lower double-precision value is
1127///    compared to the lower double-precision value of __b.
1128/// \param __b
1129///    A 128-bit vector of [2 x double]. The lower double-precision value is
1130///    compared to the lower double-precision value of __a.
1131/// \returns An integer containing the comparison results. If either of the two
1132///    lower double-precision values is NaN, 1 is returned.
1133static __inline__ int __DEFAULT_FN_ATTRS
1134_mm_ucomilt_sd(__m128d __a, __m128d __b)
1135{
1136  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1137}
1138
1139/// \brief Compares the lower double-precision floating-point values in each of
1140///    the two 128-bit floating-point vectors of [2 x double] to determine if
1141///    the value in the first parameter is less than or equal to the
1142///    corresponding value in the second parameter. The comparison yields 0 for
1143///    false, 1 for true. If either of the two lower double-precision values is
1144///    NaN, 1 is returned.
1145///
1146/// \headerfile <x86intrin.h>
1147///
1148/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction.
1149///
1150/// \param __a
1151///    A 128-bit vector of [2 x double]. The lower double-precision value is
1152///    compared to the lower double-precision value of __b.
1153/// \param __b
1154///     A 128-bit vector of [2 x double]. The lower double-precision value is
1155///     compared to the lower double-precision value of __a.
1156/// \returns An integer containing the comparison results. If either of the two
1157///     lower double-precision values is NaN, 1 is returned.
1158static __inline__ int __DEFAULT_FN_ATTRS
1159_mm_ucomile_sd(__m128d __a, __m128d __b)
1160{
1161  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1162}
1163
1164/// \brief Compares the lower double-precision floating-point values in each of
1165///    the two 128-bit floating-point vectors of [2 x double] to determine if
1166///    the value in the first parameter is greater than the corresponding value
1167///    in the second parameter. The comparison yields 0 for false, 1 for true.
1168///    If either of the two lower double-precision values is NaN, 0 is returned.
1169///
1170/// \headerfile <x86intrin.h>
1171///
1172/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction.
1173///
1174/// \param __a
1175///    A 128-bit vector of [2 x double]. The lower double-precision value is
1176///    compared to the lower double-precision value of __b.
1177/// \param __b
1178///     A 128-bit vector of [2 x double]. The lower double-precision value is
1179///     compared to the lower double-precision value of __a.
1180/// \returns An integer containing the comparison results. If either of the two
1181///     lower double-precision values is NaN, 0 is returned.
1182static __inline__ int __DEFAULT_FN_ATTRS
1183_mm_ucomigt_sd(__m128d __a, __m128d __b)
1184{
1185  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1186}
1187
1188/// \brief Compares the lower double-precision floating-point values in each of
1189///    the two 128-bit floating-point vectors of [2 x double] to determine if
1190///    the value in the first parameter is greater than or equal to the
1191///    corresponding value in the second parameter. The comparison yields 0 for
1192///    false, 1 for true.  If either of the two lower double-precision values
1193///    is NaN, 0 is returned.
1194///
1195/// \headerfile <x86intrin.h>
1196///
1197/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction.
1198///
1199/// \param __a
1200///    A 128-bit vector of [2 x double]. The lower double-precision value is
1201///    compared to the lower double-precision value of __b.
1202/// \param __b
1203///    A 128-bit vector of [2 x double]. The lower double-precision value is
1204///    compared to the lower double-precision value of __a.
1205/// \returns An integer containing the comparison results. If either of the two
1206///    lower double-precision values is NaN, 0 is returned.
1207static __inline__ int __DEFAULT_FN_ATTRS
1208_mm_ucomige_sd(__m128d __a, __m128d __b)
1209{
1210  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1211}
1212
1213/// \brief Compares the lower double-precision floating-point values in each of
1214///    the two 128-bit floating-point vectors of [2 x double] to determine if
1215///    the value in the first parameter is unequal to the corresponding value in
1216///    the second parameter. The comparison yields 0 for false, 1 for true. If
1217///    either of the two lower double-precision values is NaN, 0 is returned.
1218///
1219/// \headerfile <x86intrin.h>
1220///
1221/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction.
1222///
1223/// \param __a
1224///    A 128-bit vector of [2 x double]. The lower double-precision value is
1225///    compared to the lower double-precision value of __b.
1226/// \param __b
1227///    A 128-bit vector of [2 x double]. The lower double-precision value is
1228///    compared to the lower double-precision value of __a.
1229/// \returns An integer containing the comparison result. If either of the two
1230///    lower double-precision values is NaN, 0 is returned.
1231static __inline__ int __DEFAULT_FN_ATTRS
1232_mm_ucomineq_sd(__m128d __a, __m128d __b)
1233{
1234  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1235}
1236
1237/// \brief Converts the two double-precision floating-point elements of a
1238///    128-bit vector of [2 x double] into two single-precision floating-point
1239///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1240///    The upper 64 bits of the result vector are set to zero.
1241///
1242/// \headerfile <x86intrin.h>
1243///
1244/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
1245///
1246/// \param __a
1247///    A 128-bit vector of [2 x double].
1248/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1249///    converted values. The upper 64 bits are set to zero.
1250static __inline__ __m128 __DEFAULT_FN_ATTRS
1251_mm_cvtpd_ps(__m128d __a)
1252{
1253  return __builtin_ia32_cvtpd2ps((__v2df)__a);
1254}
1255
1256/// \brief Converts the lower two single-precision floating-point elements of a
1257///    128-bit vector of [4 x float] into two double-precision floating-point
1258///    values, returned in a 128-bit vector of [2 x double]. The upper two
1259///    elements of the input vector are unused.
1260///
1261/// \headerfile <x86intrin.h>
1262///
1263/// This intrinsic corresponds to the \c VCVTPS2PD / CVTPS2PD instruction.
1264///
1265/// \param __a
1266///    A 128-bit vector of [4 x float]. The lower two single-precision
1267///    floating-point elements are converted to double-precision values. The
1268///    upper two elements are unused.
1269/// \returns A 128-bit vector of [2 x double] containing the converted values.
1270static __inline__ __m128d __DEFAULT_FN_ATTRS
1271_mm_cvtps_pd(__m128 __a)
1272{
1273  return (__m128d) __builtin_convertvector(
1274      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1275}
1276
1277/// \brief Converts the lower two integer elements of a 128-bit vector of
1278///    [4 x i32] into two double-precision floating-point values, returned in a
1279///    128-bit vector of [2 x double]. The upper two elements of the input
1280///    vector are unused.
1281///
1282/// \headerfile <x86intrin.h>
1283///
1284/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
1285///
1286/// \param __a
1287///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1288///    converted to double-precision values. The upper two elements are unused.
1289/// \returns A 128-bit vector of [2 x double] containing the converted values.
1290static __inline__ __m128d __DEFAULT_FN_ATTRS
1291_mm_cvtepi32_pd(__m128i __a)
1292{
1293  return (__m128d) __builtin_convertvector(
1294      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1295}
1296
1297/// \brief Converts the two double-precision floating-point elements of a
1298///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1299///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1300///    64 bits of the result vector are set to zero.
1301///
1302/// \headerfile <x86intrin.h>
1303///
1304/// This intrinsic corresponds to the \c VCVTPD2DQ / CVTPD2DQ instruction.
1305///
1306/// \param __a
1307///    A 128-bit vector of [2 x double].
1308/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1309///    converted values. The upper 64 bits are set to zero.
1310static __inline__ __m128i __DEFAULT_FN_ATTRS
1311_mm_cvtpd_epi32(__m128d __a)
1312{
1313  return __builtin_ia32_cvtpd2dq((__v2df)__a);
1314}
1315
1316/// \brief Converts the low-order element of a 128-bit vector of [2 x double]
1317///    into a 32-bit signed integer value.
1318///
1319/// \headerfile <x86intrin.h>
1320///
1321/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
1322///
1323/// \param __a
1324///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1325///    conversion.
1326/// \returns A 32-bit signed integer containing the converted value.
1327static __inline__ int __DEFAULT_FN_ATTRS
1328_mm_cvtsd_si32(__m128d __a)
1329{
1330  return __builtin_ia32_cvtsd2si((__v2df)__a);
1331}
1332
1333/// \brief Converts the lower double-precision floating-point element of a
1334///    128-bit vector of [2 x double], in the second parameter, into a
1335///    single-precision floating-point value, returned in the lower 32 bits of a
1336///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1337///    copied from the upper 96 bits of the first parameter.
1338///
1339/// \headerfile <x86intrin.h>
1340///
1341/// This intrinsic corresponds to the \c VCVTSD2SS / CVTSD2SS instruction.
1342///
1343/// \param __a
1344///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1345///    copied to the upper 96 bits of the result.
1346/// \param __b
1347///    A 128-bit vector of [2 x double]. The lower double-precision
1348///    floating-point element is used in the conversion.
1349/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1350///    converted value from the second parameter. The upper 96 bits are copied
1351///    from the upper 96 bits of the first parameter.
1352static __inline__ __m128 __DEFAULT_FN_ATTRS
1353_mm_cvtsd_ss(__m128 __a, __m128d __b)
1354{
1355  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1356}
1357
1358/// \brief Converts a 32-bit signed integer value, in the second parameter, into
1359///    a double-precision floating-point value, returned in the lower 64 bits of
1360///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1361///    are copied from the upper 64 bits of the first parameter.
1362///
1363/// \headerfile <x86intrin.h>
1364///
1365/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
1366///
1367/// \param __a
1368///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1369///    copied to the upper 64 bits of the result.
1370/// \param __b
1371///    A 32-bit signed integer containing the value to be converted.
1372/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1373///    converted value from the second parameter. The upper 64 bits are copied
1374///    from the upper 64 bits of the first parameter.
1375static __inline__ __m128d __DEFAULT_FN_ATTRS
1376_mm_cvtsi32_sd(__m128d __a, int __b)
1377{
1378  __a[0] = __b;
1379  return __a;
1380}
1381
1382/// \brief Converts the lower single-precision floating-point element of a
1383///    128-bit vector of [4 x float], in the second parameter, into a
1384///    double-precision floating-point value, returned in the lower 64 bits of
1385///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1386///    are copied from the upper 64 bits of the first parameter.
1387///
1388/// \headerfile <x86intrin.h>
1389///
1390/// This intrinsic corresponds to the \c VCVTSS2SD / CVTSS2SD instruction.
1391///
1392/// \param __a
1393///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1394///    copied to the upper 64 bits of the result.
1395/// \param __b
1396///    A 128-bit vector of [4 x float]. The lower single-precision
1397///    floating-point element is used in the conversion.
1398/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1399///    converted value from the second parameter. The upper 64 bits are copied
1400///    from the upper 64 bits of the first parameter.
1401static __inline__ __m128d __DEFAULT_FN_ATTRS
1402_mm_cvtss_sd(__m128d __a, __m128 __b)
1403{
1404  __a[0] = __b[0];
1405  return __a;
1406}
1407
1408/// \brief Converts the two double-precision floating-point elements of a
1409///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1410///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the
1411///    result of either conversion is inexact, the result is truncated (rounded
1412///    towards zero) regardless of the current MXCSR setting. The upper 64 bits
1413///    of the result vector are set to zero.
1414///
1415/// \headerfile <x86intrin.h>
1416///
1417/// This intrinsic corresponds to the \c VCVTTPD2DQ / CVTTPD2DQ instruction.
1418///
1419/// \param __a
1420///    A 128-bit vector of [2 x double].
1421/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1422///    converted values. The upper 64 bits are set to zero.
1423static __inline__ __m128i __DEFAULT_FN_ATTRS
1424_mm_cvttpd_epi32(__m128d __a)
1425{
1426  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1427}
1428
1429/// \brief Converts the low-order element of a [2 x double] vector into a 32-bit
1430///    signed integer value, truncating the result when it is inexact.
1431///
1432/// \headerfile <x86intrin.h>
1433///
1434/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
1435///
1436/// \param __a
1437///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1438///    conversion.
1439/// \returns A 32-bit signed integer containing the converted value.
1440static __inline__ int __DEFAULT_FN_ATTRS
1441_mm_cvttsd_si32(__m128d __a)
1442{
1443  return __builtin_ia32_cvttsd2si((__v2df)__a);
1444}
1445
1446/// \brief Converts the two double-precision floating-point elements of a
1447///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1448///    returned in a 64-bit vector of [2 x i32].
1449///
1450/// \headerfile <x86intrin.h>
1451///
1452/// This intrinsic corresponds to the \c CVTPD2PI instruction.
1453///
1454/// \param __a
1455///    A 128-bit vector of [2 x double].
1456/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1457static __inline__ __m64 __DEFAULT_FN_ATTRS
1458_mm_cvtpd_pi32(__m128d __a)
1459{
1460  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1461}
1462
1463/// \brief Converts the two double-precision floating-point elements of a
1464///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1465///    returned in a 64-bit vector of [2 x i32]. If the result of either
1466///    conversion is inexact, the result is truncated (rounded towards zero)
1467///    regardless of the current MXCSR setting.
1468///
1469/// \headerfile <x86intrin.h>
1470///
1471/// This intrinsic corresponds to the \c CVTTPD2PI instruction.
1472///
1473/// \param __a
1474///    A 128-bit vector of [2 x double].
1475/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1476static __inline__ __m64 __DEFAULT_FN_ATTRS
1477_mm_cvttpd_pi32(__m128d __a)
1478{
1479  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1480}
1481
1482/// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of
1483///    [2 x i32] into two double-precision floating-point values, returned in a
1484///    128-bit vector of [2 x double].
1485///
1486/// \headerfile <x86intrin.h>
1487///
1488/// This intrinsic corresponds to the \c CVTPI2PD instruction.
1489///
1490/// \param __a
1491///    A 64-bit vector of [2 x i32].
1492/// \returns A 128-bit vector of [2 x double] containing the converted values.
1493static __inline__ __m128d __DEFAULT_FN_ATTRS
1494_mm_cvtpi32_pd(__m64 __a)
1495{
1496  return __builtin_ia32_cvtpi2pd((__v2si)__a);
1497}
1498
1499/// \brief Returns the low-order element of a 128-bit vector of [2 x double] as
1500///    a double-precision floating-point value.
1501///
1502/// \headerfile <x86intrin.h>
1503///
1504/// This intrinsic has no corresponding instruction.
1505///
1506/// \param __a
1507///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1508/// \returns A double-precision floating-point value copied from the lower 64
1509///    bits of __a.
1510static __inline__ double __DEFAULT_FN_ATTRS
1511_mm_cvtsd_f64(__m128d __a)
1512{
1513  return __a[0];
1514}
1515
1516/// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned
1517///    memory location.
1518///
1519/// \headerfile <x86intrin.h>
1520///
1521/// This intrinsic corresponds to the \c VMOVAPD / MOVAPD instruction.
1522///
1523/// \param __dp
1524///    A pointer to a 128-bit memory location. The address of the memory
1525///    location has to be 16-byte aligned.
1526/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1527static __inline__ __m128d __DEFAULT_FN_ATTRS
1528_mm_load_pd(double const *__dp)
1529{
1530  return *(__m128d*)__dp;
1531}
1532
1533/// \brief Loads a double-precision floating-point value from a specified memory
1534///    location and duplicates it to both vector elements of a 128-bit vector of
1535///    [2 x double].
1536///
1537/// \headerfile <x86intrin.h>
1538///
1539/// This intrinsic corresponds to the \c VMOVDDUP / MOVDDUP instruction.
1540///
1541/// \param __dp
1542///    A pointer to a memory location containing a double-precision value.
1543/// \returns A 128-bit vector of [2 x double] containing the loaded and
1544///    duplicated values.
1545static __inline__ __m128d __DEFAULT_FN_ATTRS
1546_mm_load1_pd(double const *__dp)
1547{
1548  struct __mm_load1_pd_struct {
1549    double __u;
1550  } __attribute__((__packed__, __may_alias__));
1551  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
1552  return (__m128d){ __u, __u };
1553}
1554
/* Alternate spelling of _mm_load1_pd(); forwards its argument unchanged. */
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1556
1557/// \brief Loads two double-precision values, in reverse order, from an aligned
1558///    memory location into a 128-bit vector of [2 x double].
1559///
1560/// \headerfile <x86intrin.h>
1561///
1562/// This intrinsic corresponds to the \c VMOVAPD / MOVAPD instruction + needed
1563/// shuffling instructions. In AVX mode, the shuffling may be combined with the
1564/// \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1565///
1566/// \param __dp
1567///    A 16-byte aligned pointer to an array of double-precision values to be
1568///    loaded in reverse order.
1569/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1570///    values.
1571static __inline__ __m128d __DEFAULT_FN_ATTRS
1572_mm_loadr_pd(double const *__dp)
1573{
1574  __m128d __u = *(__m128d*)__dp;
1575  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1576}
1577
1578/// \brief Loads a 128-bit floating-point vector of [2 x double] from an
1579///    unaligned memory location.
1580///
1581/// \headerfile <x86intrin.h>
1582///
1583/// This intrinsic corresponds to the \c VMOVUPD / MOVUPD instruction.
1584///
1585/// \param __dp
1586///    A pointer to a 128-bit memory location. The address of the memory
1587///    location does not have to be aligned.
1588/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1589static __inline__ __m128d __DEFAULT_FN_ATTRS
1590_mm_loadu_pd(double const *__dp)
1591{
1592  struct __loadu_pd {
1593    __m128d __v;
1594  } __attribute__((__packed__, __may_alias__));
1595  return ((struct __loadu_pd*)__dp)->__v;
1596}
1597
1598static __inline__ __m128i __DEFAULT_FN_ATTRS
1599_mm_loadu_si64(void const *__a)
1600{
1601  struct __loadu_si64 {
1602    long long __v;
1603  } __attribute__((__packed__, __may_alias__));
1604  long long __u = ((struct __loadu_si64*)__a)->__v;
1605  return (__m128i){__u, 0L};
1606}
1607
1608static __inline__ __m128d __DEFAULT_FN_ATTRS
1609_mm_load_sd(double const *__dp)
1610{
1611  struct __mm_load_sd_struct {
1612    double __u;
1613  } __attribute__((__packed__, __may_alias__));
1614  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
1615  return (__m128d){ __u, 0 };
1616}
1617
1618/// \brief Loads a double-precision value into the high-order bits of a 128-bit
1619///    vector of [2 x double]. The low-order bits are copied from the low-order
1620///    bits of the first operand.
1621///
1622/// \headerfile <x86intrin.h>
1623///
1624/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction.
1625///
1626/// \param __a
1627///    A 128-bit vector of [2 x double].
1628///    Bits [63:0] are written to bits [63:0] of the result.
1629/// \param __dp
1630///    A pointer to a 64-bit memory location containing a double-precision
1631///    floating-point value that is loaded. The loaded value is written to bits
1632///    [127:64] of the result. The address of the memory location does not have
1633///    to be aligned.
1634/// \returns A 128-bit vector of [2 x double] containing the moved values.
1635static __inline__ __m128d __DEFAULT_FN_ATTRS
1636_mm_loadh_pd(__m128d __a, double const *__dp)
1637{
1638  struct __mm_loadh_pd_struct {
1639    double __u;
1640  } __attribute__((__packed__, __may_alias__));
1641  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
1642  return (__m128d){ __a[0], __u };
1643}
1644
1645/// \brief Loads a double-precision value into the low-order bits of a 128-bit
1646///    vector of [2 x double]. The high-order bits are copied from the
1647///    high-order bits of the first operand.
1648///
1649/// \headerfile <x86intrin.h>
1650///
1651/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction.
1652///
1653/// \param __a
1654///    A 128-bit vector of [2 x double].
1655///    Bits [127:64] are written to bits [127:64] of the result.
1656/// \param __dp
1657///    A pointer to a 64-bit memory location containing a double-precision
1658///    floating-point value that is loaded. The loaded value is written to bits
1659///    [63:0] of the result. The address of the memory location does not have to
1660///    be aligned.
1661/// \returns A 128-bit vector of [2 x double] containing the moved values.
1662static __inline__ __m128d __DEFAULT_FN_ATTRS
1663_mm_loadl_pd(__m128d __a, double const *__dp)
1664{
1665  struct __mm_loadl_pd_struct {
1666    double __u;
1667  } __attribute__((__packed__, __may_alias__));
1668  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
1669  return (__m128d){ __u, __a[1] };
1670}
1671
1672/// \brief Constructs a 128-bit floating-point vector of [2 x double] with
1673///    unspecified content. This could be used as an argument to another
1674///    intrinsic function where the argument is required but the value is not
1675///    actually used.
1676///
1677/// \headerfile <x86intrin.h>
1678///
1679/// This intrinsic has no corresponding instruction.
1680///
1681/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1682///    content.
1683static __inline__ __m128d __DEFAULT_FN_ATTRS
1684_mm_undefined_pd(void)
1685{
1686  return (__m128d)__builtin_ia32_undef128();
1687}
1688
1689/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
1690///    64 bits of the vector are initialized with the specified double-precision
1691///    floating-point value. The upper 64 bits are set to zero.
1692///
1693/// \headerfile <x86intrin.h>
1694///
1695/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1696///
1697/// \param __w
1698///    A double-precision floating-point value used to initialize the lower 64
1699///    bits of the result.
1700/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1701///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1702///    set to zero.
1703static __inline__ __m128d __DEFAULT_FN_ATTRS
1704_mm_set_sd(double __w)
1705{
1706  return (__m128d){ __w, 0 };
1707}
1708
1709/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
1710///    of the two double-precision floating-point vector elements set to the
1711///    specified double-precision floating-point value.
1712///
1713/// \headerfile <x86intrin.h>
1714///
1715/// This intrinsic corresponds to the \c VMOVDDUP / MOVLHPS instruction.
1716///
1717/// \param __w
1718///    A double-precision floating-point value used to initialize each vector
1719///    element of the result.
1720/// \returns An initialized 128-bit floating-point vector of [2 x double].
1721static __inline__ __m128d __DEFAULT_FN_ATTRS
1722_mm_set1_pd(double __w)
1723{
1724  return (__m128d){ __w, __w };
1725}
1726
1727/// \brief Constructs a 128-bit floating-point vector of [2 x double]
1728///    initialized with the specified double-precision floating-point values.
1729///
1730/// \headerfile <x86intrin.h>
1731///
1732/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
1733///
1734/// \param __w
1735///    A double-precision floating-point value used to initialize the upper 64
1736///    bits of the result.
1737/// \param __x
1738///    A double-precision floating-point value used to initialize the lower 64
1739///    bits of the result.
1740/// \returns An initialized 128-bit floating-point vector of [2 x double].
1741static __inline__ __m128d __DEFAULT_FN_ATTRS
1742_mm_set_pd(double __w, double __x)
1743{
1744  return (__m128d){ __x, __w };
1745}
1746
1747/// \brief Constructs a 128-bit floating-point vector of [2 x double],
1748///    initialized in reverse order with the specified double-precision
1749///    floating-point values.
1750///
1751/// \headerfile <x86intrin.h>
1752///
1753/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
1754///
1755/// \param __w
1756///    A double-precision floating-point value used to initialize the lower 64
1757///    bits of the result.
1758/// \param __x
1759///    A double-precision floating-point value used to initialize the upper 64
1760///    bits of the result.
1761/// \returns An initialized 128-bit floating-point vector of [2 x double].
1762static __inline__ __m128d __DEFAULT_FN_ATTRS
1763_mm_setr_pd(double __w, double __x)
1764{
1765  return (__m128d){ __w, __x };
1766}
1767
1768/// \brief Constructs a 128-bit floating-point vector of [2 x double]
1769///    initialized to zero.
1770///
1771/// \headerfile <x86intrin.h>
1772///
1773/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
1774///
1775/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1776///    all elements set to zero.
1777static __inline__ __m128d __DEFAULT_FN_ATTRS
1778_mm_setzero_pd(void)
1779{
1780  return (__m128d){ 0, 0 };
1781}
1782
1783/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
1784///    64 bits are set to the lower 64 bits of the second parameter. The upper
1785///    64 bits are set to the upper 64 bits of the first parameter.
1786//
1787/// \headerfile <x86intrin.h>
1788///
1789/// This intrinsic corresponds to the \c VBLENDPD / BLENDPD instruction.
1790///
1791/// \param __a
1792///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1793///    upper 64 bits of the result.
1794/// \param __b
1795///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1796///    lower 64 bits of the result.
1797/// \returns A 128-bit vector of [2 x double] containing the moved values.
1798static __inline__ __m128d __DEFAULT_FN_ATTRS
1799_mm_move_sd(__m128d __a, __m128d __b)
1800{
1801  return (__m128d){ __b[0], __a[1] };
1802}
1803
1804/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1805///    memory location.
1806///
1807/// \headerfile <x86intrin.h>
1808///
1809/// This intrinsic corresponds to the \c VMOVSD / MOVSD instruction.
1810///
1811/// \param __dp
1812///    A pointer to a 64-bit memory location.
1813/// \param __a
1814///    A 128-bit vector of [2 x double] containing the value to be stored.
1815static __inline__ void __DEFAULT_FN_ATTRS
1816_mm_store_sd(double *__dp, __m128d __a)
1817{
1818  struct __mm_store_sd_struct {
1819    double __u;
1820  } __attribute__((__packed__, __may_alias__));
1821  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
1822}
1823
1824static __inline__ void __DEFAULT_FN_ATTRS
1825_mm_store_pd(double *__dp, __m128d __a)
1826{
1827  *(__m128d*)__dp = __a;
1828}
1829
1830static __inline__ void __DEFAULT_FN_ATTRS
1831_mm_store1_pd(double *__dp, __m128d __a)
1832{
1833  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1834  _mm_store_pd(__dp, __a);
1835}
1836
1837/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
1838///    location.
1839///
1840/// \headerfile <x86intrin.h>
1841///
1842/// This intrinsic corresponds to the \c VMOVAPD / MOVAPD instruction.
1843///
1844/// \param __dp
1845///    A pointer to a 128-bit memory location. The address of the memory
1846///    location has to be 16-byte aligned.
1847/// \param __a
1848///    A 128-bit vector of [2 x double] containing the values to be stored.
1849static __inline__ void __DEFAULT_FN_ATTRS
1850_mm_store_pd1(double *__dp, __m128d __a)
1851{
1852  return _mm_store1_pd(__dp, __a);
1853}
1854
1855/// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory
1856///    location.
1857///
1858/// \headerfile <x86intrin.h>
1859///
1860/// This intrinsic corresponds to the \c VMOVUPD / MOVUPD instruction.
1861///
1862/// \param __dp
1863///    A pointer to a 128-bit memory location. The address of the memory
1864///    location does not have to be aligned.
1865/// \param __a
1866///    A 128-bit vector of [2 x double] containing the values to be stored.
1867static __inline__ void __DEFAULT_FN_ATTRS
1868_mm_storeu_pd(double *__dp, __m128d __a)
1869{
1870  struct __storeu_pd {
1871    __m128d __v;
1872  } __attribute__((__packed__, __may_alias__));
1873  ((struct __storeu_pd*)__dp)->__v = __a;
1874}
1875
1876/// \brief Stores two double-precision values, in reverse order, from a 128-bit
1877///    vector of [2 x double] to a 16-byte aligned memory location.
1878///
1879/// \headerfile <x86intrin.h>
1880///
1881/// This intrinsic corresponds to a shuffling instruction followed by a
1882/// \c VMOVAPD / MOVAPD instruction.
1883///
1884/// \param __dp
1885///    A pointer to a 16-byte aligned memory location that can store two
1886///    double-precision values.
1887/// \param __a
1888///    A 128-bit vector of [2 x double] containing the values to be reversed and
1889///    stored.
1890static __inline__ void __DEFAULT_FN_ATTRS
1891_mm_storer_pd(double *__dp, __m128d __a)
1892{
1893  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1894  *(__m128d *)__dp = __a;
1895}
1896
1897/// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1898///    memory location.
1899///
1900/// \headerfile <x86intrin.h>
1901///
1902/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction.
1903///
1904/// \param __dp
1905///    A pointer to a 64-bit memory location.
1906/// \param __a
1907///    A 128-bit vector of [2 x double] containing the value to be stored.
1908static __inline__ void __DEFAULT_FN_ATTRS
1909_mm_storeh_pd(double *__dp, __m128d __a)
1910{
1911  struct __mm_storeh_pd_struct {
1912    double __u;
1913  } __attribute__((__packed__, __may_alias__));
1914  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
1915}
1916
1917/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1918///    memory location.
1919///
1920/// \headerfile <x86intrin.h>
1921///
1922/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction.
1923///
1924/// \param __dp
1925///    A pointer to a 64-bit memory location.
1926/// \param __a
1927///    A 128-bit vector of [2 x double] containing the value to be stored.
1928static __inline__ void __DEFAULT_FN_ATTRS
1929_mm_storel_pd(double *__dp, __m128d __a)
1930{
1931  struct __mm_storeh_pd_struct {
1932    double __u;
1933  } __attribute__((__packed__, __may_alias__));
1934  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
1935}
1936
1937/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
1938///    saving the lower 8 bits of each sum in the corresponding element of a
1939///    128-bit result vector of [16 x i8]. The integer elements of both
1940///    parameters can be either signed or unsigned.
1941///
1942/// \headerfile <x86intrin.h>
1943///
1944/// This intrinsic corresponds to the \c VPADDB / PADDB instruction.
1945///
1946/// \param __a
1947///    A 128-bit vector of [16 x i8].
1948/// \param __b
1949///    A 128-bit vector of [16 x i8].
1950/// \returns A 128-bit vector of [16 x i8] containing the sums of both
1951///    parameters.
1952static __inline__ __m128i __DEFAULT_FN_ATTRS
1953_mm_add_epi8(__m128i __a, __m128i __b)
1954{
1955  return (__m128i)((__v16qu)__a + (__v16qu)__b);
1956}
1957
1958/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
1959///    saving the lower 16 bits of each sum in the corresponding element of a
1960///    128-bit result vector of [8 x i16]. The integer elements of both
1961///    parameters can be either signed or unsigned.
1962///
1963/// \headerfile <x86intrin.h>
1964///
1965/// This intrinsic corresponds to the \c VPADDW / PADDW instruction.
1966///
1967/// \param __a
1968///    A 128-bit vector of [8 x i16].
1969/// \param __b
1970///    A 128-bit vector of [8 x i16].
1971/// \returns A 128-bit vector of [8 x i16] containing the sums of both
1972///    parameters.
1973static __inline__ __m128i __DEFAULT_FN_ATTRS
1974_mm_add_epi16(__m128i __a, __m128i __b)
1975{
1976  return (__m128i)((__v8hu)__a + (__v8hu)__b);
1977}
1978
1979/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
1980///    saving the lower 32 bits of each sum in the corresponding element of a
1981///    128-bit result vector of [4 x i32]. The integer elements of both
1982///    parameters can be either signed or unsigned.
1983///
1984/// \headerfile <x86intrin.h>
1985///
1986/// This intrinsic corresponds to the \c VPADDD / PADDD instruction.
1987///
1988/// \param __a
1989///    A 128-bit vector of [4 x i32].
1990/// \param __b
1991///    A 128-bit vector of [4 x i32].
1992/// \returns A 128-bit vector of [4 x i32] containing the sums of both
1993///    parameters.
1994static __inline__ __m128i __DEFAULT_FN_ATTRS
1995_mm_add_epi32(__m128i __a, __m128i __b)
1996{
1997  return (__m128i)((__v4su)__a + (__v4su)__b);
1998}
1999
2000/// \brief Adds two signed or unsigned 64-bit integer values, returning the
2001///    lower 64 bits of the sum.
2002///
2003/// \headerfile <x86intrin.h>
2004///
2005/// This intrinsic corresponds to the \c PADDQ instruction.
2006///
2007/// \param __a
2008///    A 64-bit integer.
2009/// \param __b
2010///    A 64-bit integer.
2011/// \returns A 64-bit integer containing the sum of both parameters.
2012static __inline__ __m64 __DEFAULT_FN_ATTRS
2013_mm_add_si64(__m64 __a, __m64 __b)
2014{
2015  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2016}
2017
2018/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2019///    saving the lower 64 bits of each sum in the corresponding element of a
2020///    128-bit result vector of [2 x i64]. The integer elements of both
2021///    parameters can be either signed or unsigned.
2022///
2023/// \headerfile <x86intrin.h>
2024///
2025/// This intrinsic corresponds to the \c VPADDQ / PADDQ instruction.
2026///
2027/// \param __a
2028///    A 128-bit vector of [2 x i64].
2029/// \param __b
2030///    A 128-bit vector of [2 x i64].
2031/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2032///    parameters.
2033static __inline__ __m128i __DEFAULT_FN_ATTRS
2034_mm_add_epi64(__m128i __a, __m128i __b)
2035{
2036  return (__m128i)((__v2du)__a + (__v2du)__b);
2037}
2038
2039/// \brief Adds, with saturation, the corresponding elements of two 128-bit
2040///    signed [16 x i8] vectors, saving each sum in the corresponding element of
2041///    a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are
2042///    saturated to 7Fh. Negative sums less than 80h are saturated to 80h.
2043///
2044/// \headerfile <x86intrin.h>
2045///
2046/// This intrinsic corresponds to the \c VPADDSB / PADDSB instruction.
2047///
2048/// \param __a
2049///    A 128-bit signed [16 x i8] vector.
2050/// \param __b
2051///    A 128-bit signed [16 x i8] vector.
2052/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2053///    both parameters.
2054static __inline__ __m128i __DEFAULT_FN_ATTRS
2055_mm_adds_epi8(__m128i __a, __m128i __b)
2056{
2057  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
2058}
2059
2060/// \brief Adds, with saturation, the corresponding elements of two 128-bit
2061///    signed [8 x i16] vectors, saving each sum in the corresponding element of
2062///    a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh
2063///    are saturated to 7FFFh. Negative sums less than 8000h are saturated to
2064///    8000h.
2065///
2066/// \headerfile <x86intrin.h>
2067///
2068/// This intrinsic corresponds to the \c VPADDSW / PADDSW instruction.
2069///
2070/// \param __a
2071///    A 128-bit signed [8 x i16] vector.
2072/// \param __b
2073///    A 128-bit signed [8 x i16] vector.
2074/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2075///    both parameters.
2076static __inline__ __m128i __DEFAULT_FN_ATTRS
2077_mm_adds_epi16(__m128i __a, __m128i __b)
2078{
2079  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
2080}
2081
2082/// \brief Adds, with saturation, the corresponding elements of two 128-bit
2083///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
2084///    of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh
2085///    are saturated to FFh. Negative sums are saturated to 00h.
2086///
2087/// \headerfile <x86intrin.h>
2088///
2089/// This intrinsic corresponds to the \c VPADDUSB / PADDUSB instruction.
2090///
2091/// \param __a
2092///    A 128-bit unsigned [16 x i8] vector.
2093/// \param __b
2094///    A 128-bit unsigned [16 x i8] vector.
2095/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2096///    of both parameters.
2097static __inline__ __m128i __DEFAULT_FN_ATTRS
2098_mm_adds_epu8(__m128i __a, __m128i __b)
2099{
2100  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
2101}
2102
2103/// \brief Adds, with saturation, the corresponding elements of two 128-bit
2104///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
2105///    of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh
2106///    are saturated to FFFFh. Negative sums are saturated to 0000h.
2107///
2108/// \headerfile <x86intrin.h>
2109///
2110/// This intrinsic corresponds to the \c VPADDUSB / PADDUSB instruction.
2111///
2112/// \param __a
2113///    A 128-bit unsigned [8 x i16] vector.
2114/// \param __b
2115///    A 128-bit unsigned [8 x i16] vector.
2116/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2117///    of both parameters.
2118static __inline__ __m128i __DEFAULT_FN_ATTRS
2119_mm_adds_epu16(__m128i __a, __m128i __b)
2120{
2121  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
2122}
2123
2124/// \brief Computes the rounded avarages of corresponding elements of two
2125///    128-bit unsigned [16 x i8] vectors, saving each result in the
2126///    corresponding element of a 128-bit result vector of [16 x i8].
2127///
2128/// \headerfile <x86intrin.h>
2129///
2130/// This intrinsic corresponds to the \c VPAVGB / PAVGB instruction.
2131///
2132/// \param __a
2133///    A 128-bit unsigned [16 x i8] vector.
2134/// \param __b
2135///    A 128-bit unsigned [16 x i8] vector.
2136/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2137///    averages of both parameters.
2138static __inline__ __m128i __DEFAULT_FN_ATTRS
2139_mm_avg_epu8(__m128i __a, __m128i __b)
2140{
2141  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2142}
2143
2144/// \brief Computes the rounded avarages of corresponding elements of two
2145///    128-bit unsigned [8 x i16] vectors, saving each result in the
2146///    corresponding element of a 128-bit result vector of [8 x i16].
2147///
2148/// \headerfile <x86intrin.h>
2149///
2150/// This intrinsic corresponds to the \c VPAVGW / PAVGW instruction.
2151///
2152/// \param __a
2153///    A 128-bit unsigned [8 x i16] vector.
2154/// \param __b
2155///    A 128-bit unsigned [8 x i16] vector.
2156/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2157///    averages of both parameters.
2158static __inline__ __m128i __DEFAULT_FN_ATTRS
2159_mm_avg_epu16(__m128i __a, __m128i __b)
2160{
2161  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2162}
2163
2164/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2165///    vectors, producing eight intermediate 32-bit signed integer products, and
2166///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2167///    [4 x i32] vector. For example, bits [15:0] of both parameters are
2168///    multiplied producing a 32-bit product, bits [31:16] of both parameters
2169///    are multiplied producing a 32-bit product, and the sum of those two
2170///    products becomes bits [31:0] of the result.
2171///
2172/// \headerfile <x86intrin.h>
2173///
2174/// This intrinsic corresponds to the \c VPMADDWD / PMADDWD instruction.
2175///
2176/// \param __a
2177///    A 128-bit signed [8 x i16] vector.
2178/// \param __b
2179///    A 128-bit signed [8 x i16] vector.
2180/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2181///    of both parameters.
2182static __inline__ __m128i __DEFAULT_FN_ATTRS
2183_mm_madd_epi16(__m128i __a, __m128i __b)
2184{
2185  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2186}
2187
2188/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
2189///    vectors, saving the greater value from each comparison in the
2190///    corresponding element of a 128-bit result vector of [8 x i16].
2191///
2192/// \headerfile <x86intrin.h>
2193///
2194/// This intrinsic corresponds to the \c VPMAXSW / PMAXSW instruction.
2195///
2196/// \param __a
2197///    A 128-bit signed [8 x i16] vector.
2198/// \param __b
2199///    A 128-bit signed [8 x i16] vector.
2200/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2201///    each comparison.
2202static __inline__ __m128i __DEFAULT_FN_ATTRS
2203_mm_max_epi16(__m128i __a, __m128i __b)
2204{
2205  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
2206}
2207
2208/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
2209///    vectors, saving the greater value from each comparison in the
2210///    corresponding element of a 128-bit result vector of [16 x i8].
2211///
2212/// \headerfile <x86intrin.h>
2213///
2214/// This intrinsic corresponds to the \c VPMAXUB / PMAXUB instruction.
2215///
2216/// \param __a
2217///    A 128-bit unsigned [16 x i8] vector.
2218/// \param __b
2219///    A 128-bit unsigned [16 x i8] vector.
2220/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2221///    each comparison.
2222static __inline__ __m128i __DEFAULT_FN_ATTRS
2223_mm_max_epu8(__m128i __a, __m128i __b)
2224{
2225  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
2226}
2227
2228/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
2229///    vectors, saving the smaller value from each comparison in the
2230///    corresponding element of a 128-bit result vector of [8 x i16].
2231///
2232/// \headerfile <x86intrin.h>
2233///
2234/// This intrinsic corresponds to the \c VPMINSW / PMINSW instruction.
2235///
2236/// \param __a
2237///    A 128-bit signed [8 x i16] vector.
2238/// \param __b
2239///    A 128-bit signed [8 x i16] vector.
2240/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2241///    each comparison.
2242static __inline__ __m128i __DEFAULT_FN_ATTRS
2243_mm_min_epi16(__m128i __a, __m128i __b)
2244{
2245  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
2246}
2247
2248/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
2249///    vectors, saving the smaller value from each comparison in the
2250///    corresponding element of a 128-bit result vector of [16 x i8].
2251///
2252/// \headerfile <x86intrin.h>
2253///
2254/// This intrinsic corresponds to the \c VPMINUB / PMINUB instruction.
2255///
2256/// \param __a
2257///    A 128-bit unsigned [16 x i8] vector.
2258/// \param __b
2259///    A 128-bit unsigned [16 x i8] vector.
2260/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2261///    each comparison.
2262static __inline__ __m128i __DEFAULT_FN_ATTRS
2263_mm_min_epu8(__m128i __a, __m128i __b)
2264{
2265  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
2266}
2267
2268/// \brief Multiplies the corresponding elements of two signed [8 x i16]
2269///    vectors, saving the upper 16 bits of each 32-bit product in the
2270///    corresponding element of a 128-bit signed [8 x i16] result vector.
2271///
2272/// \headerfile <x86intrin.h>
2273///
2274/// This intrinsic corresponds to the \c VPMULHW / PMULHW instruction.
2275///
2276/// \param __a
2277///    A 128-bit signed [8 x i16] vector.
2278/// \param __b
2279///    A 128-bit signed [8 x i16] vector.
2280/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2281///    each of the eight 32-bit products.
2282static __inline__ __m128i __DEFAULT_FN_ATTRS
2283_mm_mulhi_epi16(__m128i __a, __m128i __b)
2284{
2285  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2286}
2287
2288/// \brief Multiplies the corresponding elements of two unsigned [8 x i16]
2289///    vectors, saving the upper 16 bits of each 32-bit product in the
2290///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2291///
2292/// \headerfile <x86intrin.h>
2293///
2294/// This intrinsic corresponds to the \c VPMULHUW / PMULHUW instruction.
2295///
2296/// \param __a
2297///    A 128-bit unsigned [8 x i16] vector.
2298/// \param __b
2299///    A 128-bit unsigned [8 x i16] vector.
2300/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2301///    of each of the eight 32-bit products.
2302static __inline__ __m128i __DEFAULT_FN_ATTRS
2303_mm_mulhi_epu16(__m128i __a, __m128i __b)
2304{
2305  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2306}
2307
2308/// \brief Multiplies the corresponding elements of two signed [8 x i16]
2309///    vectors, saving the lower 16 bits of each 32-bit product in the
2310///    corresponding element of a 128-bit signed [8 x i16] result vector.
2311///
2312/// \headerfile <x86intrin.h>
2313///
2314/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
2315///
2316/// \param __a
2317///    A 128-bit signed [8 x i16] vector.
2318/// \param __b
2319///    A 128-bit signed [8 x i16] vector.
2320/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2321///    each of the eight 32-bit products.
2322static __inline__ __m128i __DEFAULT_FN_ATTRS
2323_mm_mullo_epi16(__m128i __a, __m128i __b)
2324{
2325  return (__m128i)((__v8hu)__a * (__v8hu)__b);
2326}
2327
2328/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
2329///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2330///    product.
2331///
2332/// \headerfile <x86intrin.h>
2333///
2334/// This intrinsic corresponds to the \c PMULUDQ instruction.
2335///
2336/// \param __a
2337///    A 64-bit integer containing one of the source operands.
2338/// \param __b
2339///    A 64-bit integer containing one of the source operands.
2340/// \returns A 64-bit integer vector containing the product of both operands.
2341static __inline__ __m64 __DEFAULT_FN_ATTRS
2342_mm_mul_su32(__m64 __a, __m64 __b)
2343{
2344  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2345}
2346
2347/// \brief Multiplies 32-bit unsigned integer values contained in the lower
2348///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2349///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2350///
2351/// \headerfile <x86intrin.h>
2352///
2353/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
2354///
2355/// \param __a
2356///    A [2 x i64] vector containing one of the source operands.
2357/// \param __b
2358///    A [2 x i64] vector containing one of the source operands.
2359/// \returns A [2 x i64] vector containing the product of both operands.
2360static __inline__ __m128i __DEFAULT_FN_ATTRS
2361_mm_mul_epu32(__m128i __a, __m128i __b)
2362{
2363  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2364}
2365
2366/// \brief Computes the absolute differences of corresponding 8-bit integer
2367///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2368///    separately sums the second 8 absolute differences. Packss these two
2369///    unsigned 16-bit integer sums into the upper and lower elements of a
2370///    [2 x i64] vector.
2371///
2372/// \headerfile <x86intrin.h>
2373///
2374/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
2375///
2376/// \param __a
2377///    A 128-bit integer vector containing one of the source operands.
2378/// \param __b
2379///    A 128-bit integer vector containing one of the source operands.
2380/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2381///    differences between both operands.
2382static __inline__ __m128i __DEFAULT_FN_ATTRS
2383_mm_sad_epu8(__m128i __a, __m128i __b)
2384{
2385  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2386}
2387
2388/// \brief Subtracts the corresponding 8-bit integer values in the operands.
2389///
2390/// \headerfile <x86intrin.h>
2391///
2392/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
2393///
2394/// \param __a
2395///    A 128-bit integer vector containing the minuends.
2396/// \param __b
2397///    A 128-bit integer vector containing the subtrahends.
2398/// \returns A 128-bit integer vector containing the differences of the values
2399///    in the operands.
2400static __inline__ __m128i __DEFAULT_FN_ATTRS
2401_mm_sub_epi8(__m128i __a, __m128i __b)
2402{
2403  return (__m128i)((__v16qu)__a - (__v16qu)__b);
2404}
2405
2406/// \brief Subtracts the corresponding 16-bit integer values in the operands.
2407///
2408/// \headerfile <x86intrin.h>
2409///
2410/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
2411///
2412/// \param __a
2413///    A 128-bit integer vector containing the minuends.
2414/// \param __b
2415///    A 128-bit integer vector containing the subtrahends.
2416/// \returns A 128-bit integer vector containing the differences of the values
2417///    in the operands.
2418static __inline__ __m128i __DEFAULT_FN_ATTRS
2419_mm_sub_epi16(__m128i __a, __m128i __b)
2420{
2421  return (__m128i)((__v8hu)__a - (__v8hu)__b);
2422}
2423
2424/// \brief Subtracts the corresponding 32-bit integer values in the operands.
2425///
2426/// \headerfile <x86intrin.h>
2427///
2428/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
2429///
2430/// \param __a
2431///    A 128-bit integer vector containing the minuends.
2432/// \param __b
2433///    A 128-bit integer vector containing the subtrahends.
2434/// \returns A 128-bit integer vector containing the differences of the values
2435///    in the operands.
2436static __inline__ __m128i __DEFAULT_FN_ATTRS
2437_mm_sub_epi32(__m128i __a, __m128i __b)
2438{
2439  return (__m128i)((__v4su)__a - (__v4su)__b);
2440}
2441
2442/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
2443///    difference to the corresponding bits in the destination.
2444///
2445/// \headerfile <x86intrin.h>
2446///
2447/// This intrinsic corresponds to the \c PSUBQ instruction.
2448///
2449/// \param __a
2450///    A 64-bit integer vector containing the minuend.
2451/// \param __b
2452///    A 64-bit integer vector containing the subtrahend.
2453/// \returns A 64-bit integer vector containing the difference of the values in
2454///    the operands.
2455static __inline__ __m64 __DEFAULT_FN_ATTRS
2456_mm_sub_si64(__m64 __a, __m64 __b)
2457{
2458  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2459}
2460
2461/// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
2462///
2463/// \headerfile <x86intrin.h>
2464///
2465/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
2466///
2467/// \param __a
2468///    A 128-bit integer vector containing the minuends.
2469/// \param __b
2470///    A 128-bit integer vector containing the subtrahends.
2471/// \returns A 128-bit integer vector containing the differences of the values
2472///    in the operands.
2473static __inline__ __m128i __DEFAULT_FN_ATTRS
2474_mm_sub_epi64(__m128i __a, __m128i __b)
2475{
2476  return (__m128i)((__v2du)__a - (__v2du)__b);
2477}
2478
2479/// \brief Subtracts corresponding 8-bit signed integer values in the input and
2480///    returns the differences in the corresponding bytes in the destination.
2481///    Differences greater than 7Fh are saturated to 7Fh, and differences less
2482///    than 80h are saturated to 80h.
2483///
2484/// \headerfile <x86intrin.h>
2485///
2486/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
2487///
2488/// \param __a
2489///    A 128-bit integer vector containing the minuends.
2490/// \param __b
2491///    A 128-bit integer vector containing the subtrahends.
2492/// \returns A 128-bit integer vector containing the differences of the values
2493///    in the operands.
2494static __inline__ __m128i __DEFAULT_FN_ATTRS
2495_mm_subs_epi8(__m128i __a, __m128i __b)
2496{
2497  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
2498}
2499
2500/// \brief Subtracts corresponding 16-bit signed integer values in the input and
2501///    returns the differences in the corresponding bytes in the destination.
2502///    Differences greater than 7FFFh are saturated to 7FFFh, and values less
2503///    than 8000h are saturated to 8000h.
2504///
2505/// \headerfile <x86intrin.h>
2506///
2507/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
2508///
2509/// \param __a
2510///    A 128-bit integer vector containing the minuends.
2511/// \param __b
2512///    A 128-bit integer vector containing the subtrahends.
2513/// \returns A 128-bit integer vector containing the differences of the values
2514///    in the operands.
2515static __inline__ __m128i __DEFAULT_FN_ATTRS
2516_mm_subs_epi16(__m128i __a, __m128i __b)
2517{
2518  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
2519}
2520
2521/// \brief Subtracts corresponding 8-bit unsigned integer values in the input
2522///    and returns the differences in the corresponding bytes in the
2523///    destination. Differences less than 00h are saturated to 00h.
2524///
2525/// \headerfile <x86intrin.h>
2526///
2527/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
2528///
2529/// \param __a
2530///    A 128-bit integer vector containing the minuends.
2531/// \param __b
2532///    A 128-bit integer vector containing the subtrahends.
2533/// \returns A 128-bit integer vector containing the unsigned integer
2534///    differences of the values in the operands.
2535static __inline__ __m128i __DEFAULT_FN_ATTRS
2536_mm_subs_epu8(__m128i __a, __m128i __b)
2537{
2538  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
2539}
2540
2541/// \brief Subtracts corresponding 16-bit unsigned integer values in the input
2542///    and returns the differences in the corresponding bytes in the
2543///    destination. Differences less than 0000h are saturated to 0000h.
2544///
2545/// \headerfile <x86intrin.h>
2546///
2547/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
2548///
2549/// \param __a
2550///    A 128-bit integer vector containing the minuends.
2551/// \param __b
2552///    A 128-bit integer vector containing the subtrahends.
2553/// \returns A 128-bit integer vector containing the unsigned integer
2554///    differences of the values in the operands.
2555static __inline__ __m128i __DEFAULT_FN_ATTRS
2556_mm_subs_epu16(__m128i __a, __m128i __b)
2557{
2558  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
2559}
2560
2561/// \brief Performs a bitwise AND of two 128-bit integer vectors.
2562///
2563/// \headerfile <x86intrin.h>
2564///
2565/// This intrinsic corresponds to the \c VPAND / PAND instruction.
2566///
2567/// \param __a
2568///    A 128-bit integer vector containing one of the source operands.
2569/// \param __b
2570///    A 128-bit integer vector containing one of the source operands.
2571/// \returns A 128-bit integer vector containing the bitwise AND of the values
2572///    in both operands.
2573static __inline__ __m128i __DEFAULT_FN_ATTRS
2574_mm_and_si128(__m128i __a, __m128i __b)
2575{
2576  return (__m128i)((__v2du)__a & (__v2du)__b);
2577}
2578
2579/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
2580///    one's complement of the values contained in the first source operand.
2581///
2582/// \headerfile <x86intrin.h>
2583///
2584/// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
2585///
2586/// \param __a
2587///    A 128-bit vector containing the left source operand. The one's complement
2588///    of this value is used in the bitwise AND.
2589/// \param __b
2590///    A 128-bit vector containing the right source operand.
2591/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2592///    complement of the first operand and the values in the second operand.
2593static __inline__ __m128i __DEFAULT_FN_ATTRS
2594_mm_andnot_si128(__m128i __a, __m128i __b)
2595{
2596  return (__m128i)(~(__v2du)__a & (__v2du)__b);
2597}
2598/// \brief Performs a bitwise OR of two 128-bit integer vectors.
2599///
2600/// \headerfile <x86intrin.h>
2601///
2602/// This intrinsic corresponds to the \c VPOR / POR instruction.
2603///
2604/// \param __a
2605///    A 128-bit integer vector containing one of the source operands.
2606/// \param __b
2607///    A 128-bit integer vector containing one of the source operands.
2608/// \returns A 128-bit integer vector containing the bitwise OR of the values
2609///    in both operands.
2610static __inline__ __m128i __DEFAULT_FN_ATTRS
2611_mm_or_si128(__m128i __a, __m128i __b)
2612{
2613  return (__m128i)((__v2du)__a | (__v2du)__b);
2614}
2615
2616/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
2617///
2618/// \headerfile <x86intrin.h>
2619///
2620/// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
2621///
2622/// \param __a
2623///    A 128-bit integer vector containing one of the source operands.
2624/// \param __b
2625///    A 128-bit integer vector containing one of the source operands.
2626/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2627///    values in both operands.
2628static __inline__ __m128i __DEFAULT_FN_ATTRS
2629_mm_xor_si128(__m128i __a, __m128i __b)
2630{
2631  return (__m128i)((__v2du)__a ^ (__v2du)__b);
2632}
2633
/// \brief Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
///
/// The macro is written as a 16-lane byte shuffle over an all-zero vector
/// (shuffle indices 0-15) followed by \a a (shuffle indices 16-31). Result
/// byte i uses shuffle index (16 + i) - imm, so it receives byte (i - imm) of
/// \a a when i >= imm and a zero byte otherwise. When any of bits [7:4] of
/// (char)imm is set, every lane selects from the zero vector and the whole
/// result is zero.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift
///    operand a. The argument is expanded several times in the macro body, so
///    it should be a side-effect-free constant expression.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) __extension__ ({                              \
  (__m128i)__builtin_shufflevector(                                          \
                                 (__v16qi)_mm_setzero_si128(),               \
                                 (__v16qi)(__m128i)(a),                      \
                                 ((char)(imm)&0xF0) ?  0 : 16 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  1 : 17 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  2 : 18 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  3 : 19 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  4 : 20 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  5 : 21 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  6 : 22 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  7 : 23 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  8 : 24 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  9 : 25 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
2671
/// \brief Alternate name for _mm_slli_si128: both arguments are forwarded to
///    that macro unchanged.
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
2674
2675/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
2676///    by the specified number of bits. Low-order bits are cleared.
2677///
2678/// \headerfile <x86intrin.h>
2679///
2680/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
2681///
2682/// \param __a
2683///    A 128-bit integer vector containing the source operand.
2684/// \param __count
2685///    An integer value specifying the number of bits to left-shift each value
2686///    in operand __a.
2687/// \returns A 128-bit integer vector containing the left-shifted values.
2688static __inline__ __m128i __DEFAULT_FN_ATTRS
2689_mm_slli_epi16(__m128i __a, int __count)
2690{
2691  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2692}
2693
2694/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
2695///    by the specified number of bits. Low-order bits are cleared.
2696///
2697/// \headerfile <x86intrin.h>
2698///
2699/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
2700///
2701/// \param __a
2702///    A 128-bit integer vector containing the source operand.
2703/// \param __count
2704///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2705///    to left-shift each value in operand __a.
2706/// \returns A 128-bit integer vector containing the left-shifted values.
2707static __inline__ __m128i __DEFAULT_FN_ATTRS
2708_mm_sll_epi16(__m128i __a, __m128i __count)
2709{
2710  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2711}
2712
2713/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
2714///    by the specified number of bits. Low-order bits are cleared.
2715///
2716/// \headerfile <x86intrin.h>
2717///
2718/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
2719///
2720/// \param __a
2721///    A 128-bit integer vector containing the source operand.
2722/// \param __count
2723///    An integer value specifying the number of bits to left-shift each value
2724///    in operand __a.
2725/// \returns A 128-bit integer vector containing the left-shifted values.
2726static __inline__ __m128i __DEFAULT_FN_ATTRS
2727_mm_slli_epi32(__m128i __a, int __count)
2728{
2729  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2730}
2731
2732/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
2733///    by the specified number of bits. Low-order bits are cleared.
2734///
2735/// \headerfile <x86intrin.h>
2736///
2737/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
2738///
2739/// \param __a
2740///    A 128-bit integer vector containing the source operand.
2741/// \param __count
2742///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2743///    to left-shift each value in operand __a.
2744/// \returns A 128-bit integer vector containing the left-shifted values.
2745static __inline__ __m128i __DEFAULT_FN_ATTRS
2746_mm_sll_epi32(__m128i __a, __m128i __count)
2747{
2748  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2749}
2750
2751/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
2752///    by the specified number of bits. Low-order bits are cleared.
2753///
2754/// \headerfile <x86intrin.h>
2755///
2756/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
2757///
2758/// \param __a
2759///    A 128-bit integer vector containing the source operand.
2760/// \param __count
2761///    An integer value specifying the number of bits to left-shift each value
2762///    in operand __a.
2763/// \returns A 128-bit integer vector containing the left-shifted values.
2764static __inline__ __m128i __DEFAULT_FN_ATTRS
2765_mm_slli_epi64(__m128i __a, int __count)
2766{
2767  return __builtin_ia32_psllqi128((__v2di)__a, __count);
2768}
2769
2770/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
2771///    by the specified number of bits. Low-order bits are cleared.
2772///
2773/// \headerfile <x86intrin.h>
2774///
2775/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
2776///
2777/// \param __a
2778///    A 128-bit integer vector containing the source operand.
2779/// \param __count
2780///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2781///    to left-shift each value in operand __a.
2782/// \returns A 128-bit integer vector containing the left-shifted values.
2783static __inline__ __m128i __DEFAULT_FN_ATTRS
2784_mm_sll_epi64(__m128i __a, __m128i __count)
2785{
2786  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2787}
2788
2789/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
2790///    by the specified number of bits. High-order bits are filled with the sign
2791///    bit of the initial value.
2792///
2793/// \headerfile <x86intrin.h>
2794///
2795/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
2796///
2797/// \param __a
2798///    A 128-bit integer vector containing the source operand.
2799/// \param __count
2800///    An integer value specifying the number of bits to right-shift each value
2801///    in operand __a.
2802/// \returns A 128-bit integer vector containing the right-shifted values.
2803static __inline__ __m128i __DEFAULT_FN_ATTRS
2804_mm_srai_epi16(__m128i __a, int __count)
2805{
2806  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2807}
2808
2809/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
2810///    by the specified number of bits. High-order bits are filled with the sign
2811///    bit of the initial value.
2812///
2813/// \headerfile <x86intrin.h>
2814///
2815/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
2816///
2817/// \param __a
2818///    A 128-bit integer vector containing the source operand.
2819/// \param __count
2820///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2821///    to right-shift each value in operand __a.
2822/// \returns A 128-bit integer vector containing the right-shifted values.
2823static __inline__ __m128i __DEFAULT_FN_ATTRS
2824_mm_sra_epi16(__m128i __a, __m128i __count)
2825{
2826  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2827}
2828
2829/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
2830///    by the specified number of bits. High-order bits are filled with the sign
2831///    bit of the initial value.
2832///
2833/// \headerfile <x86intrin.h>
2834///
2835/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
2836///
2837/// \param __a
2838///    A 128-bit integer vector containing the source operand.
2839/// \param __count
2840///    An integer value specifying the number of bits to right-shift each value
2841///    in operand __a.
2842/// \returns A 128-bit integer vector containing the right-shifted values.
2843static __inline__ __m128i __DEFAULT_FN_ATTRS
2844_mm_srai_epi32(__m128i __a, int __count)
2845{
2846  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2847}
2848
2849/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
2850///    by the specified number of bits. High-order bits are filled with the sign
2851///    bit of the initial value.
2852///
2853/// \headerfile <x86intrin.h>
2854///
2855/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
2856///
2857/// \param __a
2858///    A 128-bit integer vector containing the source operand.
2859/// \param __count
2860///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2861///    to right-shift each value in operand __a.
2862/// \returns A 128-bit integer vector containing the right-shifted values.
2863static __inline__ __m128i __DEFAULT_FN_ATTRS
2864_mm_sra_epi32(__m128i __a, __m128i __count)
2865{
2866  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2867}
2868
/// \brief Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
///
/// The macro is written as a 16-lane byte shuffle over \a a (shuffle indices
/// 0-15) followed by an all-zero vector (shuffle indices 16-31). Result byte
/// i uses shuffle index imm + i, so it receives byte (imm + i) of \a a when
/// imm + i <= 15 and a zero byte otherwise. When any of bits [7:4] of
/// (char)imm is set, every lane selects from the zero vector and the whole
/// result is zero.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    a. The argument is expanded several times in the macro body, so it
///    should be a side-effect-free constant expression.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) __extension__ ({                              \
  (__m128i)__builtin_shufflevector(                                          \
                                 (__v16qi)(__m128i)(a),                      \
                                 (__v16qi)_mm_setzero_si128(),               \
                                 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0,  \
                                 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1,  \
                                 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2,  \
                                 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3,  \
                                 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4,  \
                                 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5,  \
                                 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6,  \
                                 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7,  \
                                 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8,  \
                                 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9,  \
                                 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
                                 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
                                 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
                                 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
                                 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
                                 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
2906
/// \brief Alternate name for _mm_srli_si128: both arguments are forwarded to
///    that macro unchanged.
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
2909
2910/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
2911///    operand by the specified number of bits. High-order bits are cleared.
2912///
2913/// \headerfile <x86intrin.h>
2914///
2915/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
2916///
2917/// \param __a
2918///    A 128-bit integer vector containing the source operand.
2919/// \param __count
2920///    An integer value specifying the number of bits to right-shift each value
2921///    in operand __a.
2922/// \returns A 128-bit integer vector containing the right-shifted values.
2923static __inline__ __m128i __DEFAULT_FN_ATTRS
2924_mm_srli_epi16(__m128i __a, int __count)
2925{
2926  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2927}
2928
2929/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
2930///    operand by the specified number of bits. High-order bits are cleared.
2931///
2932/// \headerfile <x86intrin.h>
2933///
2934/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
2935///
2936/// \param __a
2937///    A 128-bit integer vector containing the source operand.
2938/// \param __count
2939///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2940///    to right-shift each value in operand __a.
2941/// \returns A 128-bit integer vector containing the right-shifted values.
2942static __inline__ __m128i __DEFAULT_FN_ATTRS
2943_mm_srl_epi16(__m128i __a, __m128i __count)
2944{
2945  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2946}
2947
2948/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
2949///    operand by the specified number of bits. High-order bits are cleared.
2950///
2951/// \headerfile <x86intrin.h>
2952///
2953/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
2954///
2955/// \param __a
2956///    A 128-bit integer vector containing the source operand.
2957/// \param __count
2958///    An integer value specifying the number of bits to right-shift each value
2959///    in operand __a.
2960/// \returns A 128-bit integer vector containing the right-shifted values.
2961static __inline__ __m128i __DEFAULT_FN_ATTRS
2962_mm_srli_epi32(__m128i __a, int __count)
2963{
2964  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2965}
2966
2967/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
2968///    operand by the specified number of bits. High-order bits are cleared.
2969///
2970/// \headerfile <x86intrin.h>
2971///
2972/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
2973///
2974/// \param __a
2975///    A 128-bit integer vector containing the source operand.
2976/// \param __count
2977///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2978///    to right-shift each value in operand __a.
2979/// \returns A 128-bit integer vector containing the right-shifted values.
2980static __inline__ __m128i __DEFAULT_FN_ATTRS
2981_mm_srl_epi32(__m128i __a, __m128i __count)
2982{
2983  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2984}
2985
2986/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
2987///    operand by the specified number of bits. High-order bits are cleared.
2988///
2989/// \headerfile <x86intrin.h>
2990///
2991/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
2992///
2993/// \param __a
2994///    A 128-bit integer vector containing the source operand.
2995/// \param __count
2996///    An integer value specifying the number of bits to right-shift each value
2997///    in operand __a.
2998/// \returns A 128-bit integer vector containing the right-shifted values.
2999static __inline__ __m128i __DEFAULT_FN_ATTRS
3000_mm_srli_epi64(__m128i __a, int __count)
3001{
3002  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3003}
3004
3005/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
3006///    operand by the specified number of bits. High-order bits are cleared.
3007///
3008/// \headerfile <x86intrin.h>
3009///
3010/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
3011///
3012/// \param __a
3013///    A 128-bit integer vector containing the source operand.
3014/// \param __count
3015///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3016///    to right-shift each value in operand __a.
3017/// \returns A 128-bit integer vector containing the right-shifted values.
3018static __inline__ __m128i __DEFAULT_FN_ATTRS
3019_mm_srl_epi64(__m128i __a, __m128i __count)
3020{
3021  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3022}
3023
3024/// \brief Compares each of the corresponding 8-bit values of the 128-bit
3025///    integer vectors for equality. Each comparison yields 0h for false, FFh
3026///    for true.
3027///
3028/// \headerfile <x86intrin.h>
3029///
3030/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
3031///
3032/// \param __a
3033///    A 128-bit integer vector.
3034/// \param __b
3035///    A 128-bit integer vector.
3036/// \returns A 128-bit integer vector containing the comparison results.
3037static __inline__ __m128i __DEFAULT_FN_ATTRS
3038_mm_cmpeq_epi8(__m128i __a, __m128i __b)
3039{
3040  return (__m128i)((__v16qi)__a == (__v16qi)__b);
3041}
3042
3043/// \brief Compares each of the corresponding 16-bit values of the 128-bit
3044///    integer vectors for equality. Each comparison yields 0h for false, FFFFh
3045///    for true.
3046///
3047/// \headerfile <x86intrin.h>
3048///
3049/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
3050///
3051/// \param __a
3052///    A 128-bit integer vector.
3053/// \param __b
3054///    A 128-bit integer vector.
3055/// \returns A 128-bit integer vector containing the comparison results.
3056static __inline__ __m128i __DEFAULT_FN_ATTRS
3057_mm_cmpeq_epi16(__m128i __a, __m128i __b)
3058{
3059  return (__m128i)((__v8hi)__a == (__v8hi)__b);
3060}
3061
3062/// \brief Compares each of the corresponding 32-bit values of the 128-bit
3063///    integer vectors for equality. Each comparison yields 0h for false,
3064///    FFFFFFFFh for true.
3065///
3066/// \headerfile <x86intrin.h>
3067///
3068/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
3069///
3070/// \param __a
3071///    A 128-bit integer vector.
3072/// \param __b
3073///    A 128-bit integer vector.
3074/// \returns A 128-bit integer vector containing the comparison results.
3075static __inline__ __m128i __DEFAULT_FN_ATTRS
3076_mm_cmpeq_epi32(__m128i __a, __m128i __b)
3077{
3078  return (__m128i)((__v4si)__a == (__v4si)__b);
3079}
3080
3081/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
3082///    integer vectors to determine if the values in the first operand are
3083///    greater than those in the second operand. Each comparison yields 0h for
3084///    false, FFh for true.
3085///
3086/// \headerfile <x86intrin.h>
3087///
3088/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
3089///
3090/// \param __a
3091///    A 128-bit integer vector.
3092/// \param __b
3093///    A 128-bit integer vector.
3094/// \returns A 128-bit integer vector containing the comparison results.
3095static __inline__ __m128i __DEFAULT_FN_ATTRS
3096_mm_cmpgt_epi8(__m128i __a, __m128i __b)
3097{
3098  /* This function always performs a signed comparison, but __v16qi is a char
3099     which may be signed or unsigned, so use __v16qs. */
3100  return (__m128i)((__v16qs)__a > (__v16qs)__b);
3101}
3102
3103/// \brief Compares each of the corresponding signed 16-bit values of the
3104///    128-bit integer vectors to determine if the values in the first operand
3105///    are greater than those in the second operand. Each comparison yields 0h
3106///    for false, FFFFh for true.
3107///
3108/// \headerfile <x86intrin.h>
3109///
3110/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
3111///
3112/// \param __a
3113///    A 128-bit integer vector.
3114/// \param __b
3115///    A 128-bit integer vector.
3116/// \returns A 128-bit integer vector containing the comparison results.
3117static __inline__ __m128i __DEFAULT_FN_ATTRS
3118_mm_cmpgt_epi16(__m128i __a, __m128i __b)
3119{
3120  return (__m128i)((__v8hi)__a > (__v8hi)__b);
3121}
3122
3123/// \brief Compares each of the corresponding signed 32-bit values of the
3124///    128-bit integer vectors to determine if the values in the first operand
3125///    are greater than those in the second operand. Each comparison yields 0h
3126///    for false, FFFFFFFFh for true.
3127///
3128/// \headerfile <x86intrin.h>
3129///
3130/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
3131///
3132/// \param __a
3133///    A 128-bit integer vector.
3134/// \param __b
3135///    A 128-bit integer vector.
3136/// \returns A 128-bit integer vector containing the comparison results.
3137static __inline__ __m128i __DEFAULT_FN_ATTRS
3138_mm_cmpgt_epi32(__m128i __a, __m128i __b)
3139{
3140  return (__m128i)((__v4si)__a > (__v4si)__b);
3141}
3142
3143/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
3144///    integer vectors to determine if the values in the first operand are less
3145///    than those in the second operand. Each comparison yields 0h for false,
3146///    FFh for true.
3147///
3148/// \headerfile <x86intrin.h>
3149///
3150/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
3151///
3152/// \param __a
3153///    A 128-bit integer vector.
3154/// \param __b
3155///    A 128-bit integer vector.
3156/// \returns A 128-bit integer vector containing the comparison results.
3157static __inline__ __m128i __DEFAULT_FN_ATTRS
3158_mm_cmplt_epi8(__m128i __a, __m128i __b)
3159{
3160  return _mm_cmpgt_epi8(__b, __a);
3161}
3162
3163/// \brief Compares each of the corresponding signed 16-bit values of the
3164///    128-bit integer vectors to determine if the values in the first operand
3165///    are less than those in the second operand. Each comparison yields 0h for
3166///    false, FFFFh for true.
3167///
3168/// \headerfile <x86intrin.h>
3169///
3170/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
3171///
3172/// \param __a
3173///    A 128-bit integer vector.
3174/// \param __b
3175///    A 128-bit integer vector.
3176/// \returns A 128-bit integer vector containing the comparison results.
3177static __inline__ __m128i __DEFAULT_FN_ATTRS
3178_mm_cmplt_epi16(__m128i __a, __m128i __b)
3179{
3180  return _mm_cmpgt_epi16(__b, __a);
3181}
3182
3183/// \brief Compares each of the corresponding signed 32-bit values of the
3184///    128-bit integer vectors to determine if the values in the first operand
3185///    are less than those in the second operand. Each comparison yields 0h for
3186///    false, FFFFFFFFh for true.
3187///
3188/// \headerfile <x86intrin.h>
3189///
3190/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
3191///
3192/// \param __a
3193///    A 128-bit integer vector.
3194/// \param __b
3195///    A 128-bit integer vector.
3196/// \returns A 128-bit integer vector containing the comparison results.
3197static __inline__ __m128i __DEFAULT_FN_ATTRS
3198_mm_cmplt_epi32(__m128i __a, __m128i __b)
3199{
3200  return _mm_cmpgt_epi32(__b, __a);
3201}
3202
3203#ifdef __x86_64__
3204/// \brief Converts a 64-bit signed integer value from the second operand into a
3205///    double-precision value and returns it in the lower element of a [2 x
3206///    double] vector; the upper element of the returned vector is copied from
3207///    the upper element of the first operand.
3208///
3209/// \headerfile <x86intrin.h>
3210///
3211/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
3212///
3213/// \param __a
3214///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3215///    copied to the upper 64 bits of the destination.
3216/// \param __b
3217///    A 64-bit signed integer operand containing the value to be converted.
3218/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3219///    converted value of the second operand. The upper 64 bits are copied from
3220///    the upper 64 bits of the first operand.
3221static __inline__ __m128d __DEFAULT_FN_ATTRS
3222_mm_cvtsi64_sd(__m128d __a, long long __b)
3223{
3224  __a[0] = __b;
3225  return __a;
3226}
3227
3228/// \brief Converts the first (lower) element of a vector of [2 x double] into a
3229///    64-bit signed integer value, according to the current rounding mode.
3230///
3231/// \headerfile <x86intrin.h>
3232///
3233/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
3234///
3235/// \param __a
3236///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3237///    conversion.
3238/// \returns A 64-bit signed integer containing the converted value.
3239static __inline__ long long __DEFAULT_FN_ATTRS
3240_mm_cvtsd_si64(__m128d __a)
3241{
3242  return __builtin_ia32_cvtsd2si64((__v2df)__a);
3243}
3244
3245/// \brief Converts the first (lower) element of a vector of [2 x double] into a
3246///    64-bit signed integer value, truncating the result when it is inexact.
3247///
3248/// \headerfile <x86intrin.h>
3249///
3250/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
3251///
3252/// \param __a
3253///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3254///    conversion.
3255/// \returns A 64-bit signed integer containing the converted value.
3256static __inline__ long long __DEFAULT_FN_ATTRS
3257_mm_cvttsd_si64(__m128d __a)
3258{
3259  return __builtin_ia32_cvttsd2si64((__v2df)__a);
3260}
3261#endif
3262
3263/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
3264///
3265/// \headerfile <x86intrin.h>
3266///
3267/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
3268///
3269/// \param __a
3270///    A 128-bit integer vector.
3271/// \returns A 128-bit vector of [4 x float] containing the converted values.
3272static __inline__ __m128 __DEFAULT_FN_ATTRS
3273_mm_cvtepi32_ps(__m128i __a)
3274{
3275  return __builtin_ia32_cvtdq2ps((__v4si)__a);
3276}
3277
3278/// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
3279///
3280/// \headerfile <x86intrin.h>
3281///
3282/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
3283///
3284/// \param __a
3285///    A 128-bit vector of [4 x float].
3286/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3287///    values.
3288static __inline__ __m128i __DEFAULT_FN_ATTRS
3289_mm_cvtps_epi32(__m128 __a)
3290{
3291  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3292}
3293
3294/// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
3295///    truncating the result when it is inexact.
3296///
3297/// \headerfile <x86intrin.h>
3298///
3299/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
3300///
3301/// \param __a
3302///    A 128-bit vector of [4 x float].
3303/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3304static __inline__ __m128i __DEFAULT_FN_ATTRS
3305_mm_cvttps_epi32(__m128 __a)
3306{
3307  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3308}
3309
3310/// \brief Returns a vector of [4 x i32] where the lowest element is the input
3311///    operand and the remaining elements are zero.
3312///
3313/// \headerfile <x86intrin.h>
3314///
3315/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
3316///
3317/// \param __a
3318///    A 32-bit signed integer operand.
3319/// \returns A 128-bit vector of [4 x i32].
3320static __inline__ __m128i __DEFAULT_FN_ATTRS
3321_mm_cvtsi32_si128(int __a)
3322{
3323  return (__m128i)(__v4si){ __a, 0, 0, 0 };
3324}
3325
3326#ifdef __x86_64__
3327/// \brief Returns a vector of [2 x i64] where the lower element is the input
3328///    operand and the upper element is zero.
3329///
3330/// \headerfile <x86intrin.h>
3331///
3332/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
3333///
3334/// \param __a
3335///    A 64-bit signed integer operand containing the value to be converted.
3336/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3337static __inline__ __m128i __DEFAULT_FN_ATTRS
3338_mm_cvtsi64_si128(long long __a)
3339{
3340  return (__m128i){ __a, 0 };
3341}
3342#endif
3343
3344/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
3345///    32-bit signed integer value.
3346///
3347/// \headerfile <x86intrin.h>
3348///
3349/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
3350///
3351/// \param __a
3352///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3353///    destination.
3354/// \returns A 32-bit signed integer containing the moved value.
3355static __inline__ int __DEFAULT_FN_ATTRS
3356_mm_cvtsi128_si32(__m128i __a)
3357{
3358  __v4si __b = (__v4si)__a;
3359  return __b[0];
3360}
3361
3362#ifdef __x86_64__
3363/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
3364///    64-bit signed integer value.
3365///
3366/// \headerfile <x86intrin.h>
3367///
3368/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
3369///
3370/// \param __a
3371///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3372///    destination.
3373/// \returns A 64-bit signed integer containing the moved value.
3374static __inline__ long long __DEFAULT_FN_ATTRS
3375_mm_cvtsi128_si64(__m128i __a)
3376{
3377  return __a[0];
3378}
3379#endif
3380
3381/// \brief Moves packed integer values from an aligned 128-bit memory location
3382///    to elements in a 128-bit integer vector.
3383///
3384/// \headerfile <x86intrin.h>
3385///
3386/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
3387///
3388/// \param __p
3389///    An aligned pointer to a memory location containing integer values.
3390/// \returns A 128-bit integer vector containing the moved values.
3391static __inline__ __m128i __DEFAULT_FN_ATTRS
3392_mm_load_si128(__m128i const *__p)
3393{
3394  return *__p;
3395}
3396
3397/// \brief Moves packed integer values from an unaligned 128-bit memory location
3398///    to elements in a 128-bit integer vector.
3399///
3400/// \headerfile <x86intrin.h>
3401///
3402/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
3403///
3404/// \param __p
3405///    A pointer to a memory location containing integer values.
3406/// \returns A 128-bit integer vector containing the moved values.
3407static __inline__ __m128i __DEFAULT_FN_ATTRS
3408_mm_loadu_si128(__m128i const *__p)
3409{
3410  struct __loadu_si128 {
3411    __m128i __v;
3412  } __attribute__((__packed__, __may_alias__));
3413  return ((struct __loadu_si128*)__p)->__v;
3414}
3415
3416/// \brief Returns a vector of [2 x i64] where the lower element is taken from
3417///    the lower element of the operand, and the upper element is zero.
3418///
3419/// \headerfile <x86intrin.h>
3420///
3421/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
3422///
/// \param __p
///    A pointer to a 128-bit vector of [2 x i64]. Bits [63:0] of the value it
///    points to are written to bits [63:0] of the destination.
3426/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3427///    moved value. The higher order bits are cleared.
3428static __inline__ __m128i __DEFAULT_FN_ATTRS
3429_mm_loadl_epi64(__m128i const *__p)
3430{
3431  struct __mm_loadl_epi64_struct {
3432    long long __u;
3433  } __attribute__((__packed__, __may_alias__));
3434  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
3435}
3436
3437/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
3438///    This could be used as an argument to another intrinsic function where the
3439///    argument is required but the value is not actually used.
3440///
3441/// \headerfile <x86intrin.h>
3442///
3443/// This intrinsic has no corresponding instruction.
3444///
3445/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3446static __inline__ __m128i __DEFAULT_FN_ATTRS
3447_mm_undefined_si128(void)
3448{
3449  return (__m128i)__builtin_ia32_undef128();
3450}
3451
3452/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3453///    the specified 64-bit integer values.
3454///
3455/// \headerfile <x86intrin.h>
3456///
3457/// This intrinsic is a utility function and does not correspond to a specific
3458///    instruction.
3459///
3460/// \param __q1
3461///    A 64-bit integer value used to initialize the upper 64 bits of the
3462///    destination vector of [2 x i64].
3463/// \param __q0
3464///    A 64-bit integer value used to initialize the lower 64 bits of the
3465///    destination vector of [2 x i64].
3466/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3467///    provided in the operands.
3468static __inline__ __m128i __DEFAULT_FN_ATTRS
3469_mm_set_epi64x(long long __q1, long long __q0)
3470{
3471  return (__m128i){ __q0, __q1 };
3472}
3473
3474/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3475///    the specified 64-bit integer values.
3476///
3477/// \headerfile <x86intrin.h>
3478///
3479/// This intrinsic is a utility function and does not correspond to a specific
3480///    instruction.
3481///
3482/// \param __q1
3483///    A 64-bit integer value used to initialize the upper 64 bits of the
3484///    destination vector of [2 x i64].
3485/// \param __q0
3486///    A 64-bit integer value used to initialize the lower 64 bits of the
3487///    destination vector of [2 x i64].
3488/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3489///    provided in the operands.
3490static __inline__ __m128i __DEFAULT_FN_ATTRS
3491_mm_set_epi64(__m64 __q1, __m64 __q0)
3492{
3493  return (__m128i){ (long long)__q0, (long long)__q1 };
3494}
3495
3496/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3497///    the specified 32-bit integer values.
3498///
3499/// \headerfile <x86intrin.h>
3500///
3501/// This intrinsic is a utility function and does not correspond to a specific
3502///    instruction.
3503///
3504/// \param __i3
3505///    A 32-bit integer value used to initialize bits [127:96] of the
3506///    destination vector.
3507/// \param __i2
3508///    A 32-bit integer value used to initialize bits [95:64] of the destination
3509///    vector.
3510/// \param __i1
3511///    A 32-bit integer value used to initialize bits [63:32] of the destination
3512///    vector.
3513/// \param __i0
3514///    A 32-bit integer value used to initialize bits [31:0] of the destination
3515///    vector.
3516/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3517///    provided in the operands.
3518static __inline__ __m128i __DEFAULT_FN_ATTRS
3519_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
3520{
3521  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3522}
3523
3524/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3525///    the specified 16-bit integer values.
3526///
3527/// \headerfile <x86intrin.h>
3528///
3529/// This intrinsic is a utility function and does not correspond to a specific
3530///    instruction.
3531///
3532/// \param __w7
3533///    A 16-bit integer value used to initialize bits [127:112] of the
3534///    destination vector.
3535/// \param __w6
3536///    A 16-bit integer value used to initialize bits [111:96] of the
3537///    destination vector.
3538/// \param __w5
3539///    A 16-bit integer value used to initialize bits [95:80] of the destination
3540///    vector.
3541/// \param __w4
3542///    A 16-bit integer value used to initialize bits [79:64] of the destination
3543///    vector.
3544/// \param __w3
3545///    A 16-bit integer value used to initialize bits [63:48] of the destination
3546///    vector.
3547/// \param __w2
3548///    A 16-bit integer value used to initialize bits [47:32] of the destination
3549///    vector.
3550/// \param __w1
3551///    A 16-bit integer value used to initialize bits [31:16] of the destination
3552///    vector.
3553/// \param __w0
3554///    A 16-bit integer value used to initialize bits [15:0] of the destination
3555///    vector.
3556/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3557///    provided in the operands.
3558static __inline__ __m128i __DEFAULT_FN_ATTRS
3559_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
3560{
3561  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3562}
3563
3564/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3565///    the specified 8-bit integer values.
3566///
3567/// \headerfile <x86intrin.h>
3568///
3569/// This intrinsic is a utility function and does not correspond to a specific
3570///    instruction.
3571///
3572/// \param __b15
3573///    Initializes bits [127:120] of the destination vector.
3574/// \param __b14
3575///    Initializes bits [119:112] of the destination vector.
3576/// \param __b13
3577///    Initializes bits [111:104] of the destination vector.
3578/// \param __b12
3579///    Initializes bits [103:96] of the destination vector.
3580/// \param __b11
3581///    Initializes bits [95:88] of the destination vector.
3582/// \param __b10
3583///    Initializes bits [87:80] of the destination vector.
3584/// \param __b9
3585///    Initializes bits [79:72] of the destination vector.
3586/// \param __b8
3587///    Initializes bits [71:64] of the destination vector.
3588/// \param __b7
3589///    Initializes bits [63:56] of the destination vector.
3590/// \param __b6
3591///    Initializes bits [55:48] of the destination vector.
3592/// \param __b5
3593///    Initializes bits [47:40] of the destination vector.
3594/// \param __b4
3595///    Initializes bits [39:32] of the destination vector.
3596/// \param __b3
3597///    Initializes bits [31:24] of the destination vector.
3598/// \param __b2
3599///    Initializes bits [23:16] of the destination vector.
3600/// \param __b1
3601///    Initializes bits [15:8] of the destination vector.
3602/// \param __b0
3603///    Initializes bits [7:0] of the destination vector.
3604/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3605///    provided in the operands.
3606static __inline__ __m128i __DEFAULT_FN_ATTRS
3607_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
3608{
3609  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3610}
3611
3612/// \brief Initializes both values in a 128-bit integer vector with the
3613///    specified 64-bit integer value.
3614///
3615/// \headerfile <x86intrin.h>
3616///
3617/// This intrinsic is a utility function and does not correspond to a specific
3618///    instruction.
3619///
3620/// \param __q
3621///    Integer value used to initialize the elements of the destination integer
3622///    vector.
3623/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3624///    elements containing the value provided in the operand.
3625static __inline__ __m128i __DEFAULT_FN_ATTRS
3626_mm_set1_epi64x(long long __q)
3627{
3628  return (__m128i){ __q, __q };
3629}
3630
3631/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
3632///    specified 64-bit value.
3633///
3634/// \headerfile <x86intrin.h>
3635///
3636/// This intrinsic is a utility function and does not correspond to a specific
3637///    instruction.
3638///
3639/// \param __q
3640///    A 64-bit value used to initialize the elements of the destination integer
3641///    vector.
3642/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3643///    containing the value provided in the operand.
3644static __inline__ __m128i __DEFAULT_FN_ATTRS
3645_mm_set1_epi64(__m64 __q)
3646{
3647  return (__m128i){ (long long)__q, (long long)__q };
3648}
3649
3650/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
3651///    specified 32-bit value.
3652///
3653/// \headerfile <x86intrin.h>
3654///
3655/// This intrinsic is a utility function and does not correspond to a specific
3656///    instruction.
3657///
3658/// \param __i
3659///    A 32-bit value used to initialize the elements of the destination integer
3660///    vector.
3661/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3662///    containing the value provided in the operand.
3663static __inline__ __m128i __DEFAULT_FN_ATTRS
3664_mm_set1_epi32(int __i)
3665{
3666  return (__m128i)(__v4si){ __i, __i, __i, __i };
3667}
3668
3669/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
3670///    specified 16-bit value.
3671///
3672/// \headerfile <x86intrin.h>
3673///
3674/// This intrinsic is a utility function and does not correspond to a specific
3675///    instruction.
3676///
3677/// \param __w
3678///    A 16-bit value used to initialize the elements of the destination integer
3679///    vector.
3680/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3681///    containing the value provided in the operand.
3682static __inline__ __m128i __DEFAULT_FN_ATTRS
3683_mm_set1_epi16(short __w)
3684{
3685  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
3686}
3687
3688/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
3689///    specified 8-bit value.
3690///
3691/// \headerfile <x86intrin.h>
3692///
3693/// This intrinsic is a utility function and does not correspond to a specific
3694///    instruction.
3695///
3696/// \param __b
3697///    An 8-bit value used to initialize the elements of the destination integer
3698///    vector.
3699/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3700///    containing the value provided in the operand.
3701static __inline__ __m128i __DEFAULT_FN_ATTRS
3702_mm_set1_epi8(char __b)
3703{
3704  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
3705}
3706
3707/// \brief Constructs a 128-bit integer vector, initialized in reverse order
3708///     with the specified 64-bit integral values.
3709///
3710/// \headerfile <x86intrin.h>
3711///
3712/// This intrinsic corresponds to the \c VPUNPCKLQDQ / PUNPCKLQDQ instruction.
3713///
3714/// \param __q0
3715///    A 64-bit integral value used to initialize the lower 64 bits of the
3716///    result.
3717/// \param __q1
3718///    A 64-bit integral value used to initialize the upper 64 bits of the
3719///    result.
3720/// \returns An initialized 128-bit integer vector.
3721static __inline__ __m128i __DEFAULT_FN_ATTRS
3722_mm_setr_epi64(__m64 __q0, __m64 __q1)
3723{
3724  return (__m128i){ (long long)__q0, (long long)__q1 };
3725}
3726
3727/// \brief Constructs a 128-bit integer vector, initialized in reverse order
3728///     with the specified 32-bit integral values.
3729///
3730/// \headerfile <x86intrin.h>
3731///
3732/// This intrinsic is a utility function and does not correspond to a specific
3733///    instruction.
3734///
3735/// \param __i0
3736///    A 32-bit integral value used to initialize bits [31:0] of the result.
3737/// \param __i1
3738///    A 32-bit integral value used to initialize bits [63:32] of the result.
3739/// \param __i2
3740///    A 32-bit integral value used to initialize bits [95:64] of the result.
3741/// \param __i3
3742///    A 32-bit integral value used to initialize bits [127:96] of the result.
3743/// \returns An initialized 128-bit integer vector.
3744static __inline__ __m128i __DEFAULT_FN_ATTRS
3745_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
3746{
3747  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3748}
3749
3750/// \brief Constructs a 128-bit integer vector, initialized in reverse order
3751///     with the specified 16-bit integral values.
3752///
3753/// \headerfile <x86intrin.h>
3754///
3755/// This intrinsic is a utility function and does not correspond to a specific
3756///    instruction.
3757///
3758/// \param __w0
3759///    A 16-bit integral value used to initialize bits [15:0] of the result.
3760/// \param __w1
3761///    A 16-bit integral value used to initialize bits [31:16] of the result.
3762/// \param __w2
3763///    A 16-bit integral value used to initialize bits [47:32] of the result.
3764/// \param __w3
3765///    A 16-bit integral value used to initialize bits [63:48] of the result.
3766/// \param __w4
3767///    A 16-bit integral value used to initialize bits [79:64] of the result.
3768/// \param __w5
3769///    A 16-bit integral value used to initialize bits [95:80] of the result.
3770/// \param __w6
3771///    A 16-bit integral value used to initialize bits [111:96] of the result.
3772/// \param __w7
3773///    A 16-bit integral value used to initialize bits [127:112] of the result.
3774/// \returns An initialized 128-bit integer vector.
3775static __inline__ __m128i __DEFAULT_FN_ATTRS
3776_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
3777{
3778  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3779}
3780
3781/// \brief Constructs a 128-bit integer vector, initialized in reverse order
3782///     with the specified 8-bit integral values.
3783///
3784/// \headerfile <x86intrin.h>
3785///
3786/// This intrinsic is a utility function and does not correspond to a specific
3787///    instruction.
3788///
3789/// \param __b0
3790///    An 8-bit integral value used to initialize bits [7:0] of the result.
3791/// \param __b1
3792///    An 8-bit integral value used to initialize bits [15:8] of the result.
3793/// \param __b2
3794///    An 8-bit integral value used to initialize bits [23:16] of the result.
3795/// \param __b3
3796///    An 8-bit integral value used to initialize bits [31:24] of the result.
3797/// \param __b4
3798///    An 8-bit integral value used to initialize bits [39:32] of the result.
3799/// \param __b5
3800///    An 8-bit integral value used to initialize bits [47:40] of the result.
3801/// \param __b6
3802///    An 8-bit integral value used to initialize bits [55:48] of the result.
3803/// \param __b7
3804///    An 8-bit integral value used to initialize bits [63:56] of the result.
3805/// \param __b8
3806///    An 8-bit integral value used to initialize bits [71:64] of the result.
3807/// \param __b9
3808///    An 8-bit integral value used to initialize bits [79:72] of the result.
3809/// \param __b10
3810///    An 8-bit integral value used to initialize bits [87:80] of the result.
3811/// \param __b11
3812///    An 8-bit integral value used to initialize bits [95:88] of the result.
3813/// \param __b12
3814///    An 8-bit integral value used to initialize bits [103:96] of the result.
3815/// \param __b13
3816///    An 8-bit integral value used to initialize bits [111:104] of the result.
3817/// \param __b14
3818///    An 8-bit integral value used to initialize bits [119:112] of the result.
3819/// \param __b15
3820///    An 8-bit integral value used to initialize bits [127:120] of the result.
3821/// \returns An initialized 128-bit integer vector.
3822static __inline__ __m128i __DEFAULT_FN_ATTRS
3823_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
3824{
3825  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3826}
3827
3828/// \brief Creates a 128-bit integer vector initialized to zero.
3829///
3830/// \headerfile <x86intrin.h>
3831///
3832/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
3833///
3834/// \returns An initialized 128-bit integer vector with all elements set to
3835///    zero.
3836static __inline__ __m128i __DEFAULT_FN_ATTRS
3837_mm_setzero_si128(void)
3838{
3839  return (__m128i){ 0LL, 0LL };
3840}
3841
3842/// \brief Stores a 128-bit integer vector to a memory location aligned on a
3843///    128-bit boundary.
3844///
3845/// \headerfile <x86intrin.h>
3846///
3847/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
3848///
3849/// \param __p
3850///    A pointer to an aligned memory location that will receive the integer
3851///    values.
3852/// \param __b
3853///    A 128-bit integer vector containing the values to be moved.
3854static __inline__ void __DEFAULT_FN_ATTRS
3855_mm_store_si128(__m128i *__p, __m128i __b)
3856{
3857  *__p = __b;
3858}
3859
3860/// \brief Stores a 128-bit integer vector to an unaligned memory location.
3861///
3862/// \headerfile <x86intrin.h>
3863///
3864/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
3865///
3866/// \param __p
3867///    A pointer to a memory location that will receive the integer values.
3868/// \param __b
3869///    A 128-bit integer vector containing the values to be moved.
3870static __inline__ void __DEFAULT_FN_ATTRS
3871_mm_storeu_si128(__m128i *__p, __m128i __b)
3872{
3873  struct __storeu_si128 {
3874    __m128i __v;
3875  } __attribute__((__packed__, __may_alias__));
3876  ((struct __storeu_si128*)__p)->__v = __b;
3877}
3878
3879/// \brief Moves bytes selected by the mask from the first operand to the
3880///    specified unaligned memory location. When a mask bit is 1, the
3881///    corresponding byte is written, otherwise it is not written. To minimize
3882///    caching, the date is flagged as non-temporal (unlikely to be used again
3883///    soon). Exception and trap behavior for elements not selected for storage
3884///    to memory are implementation dependent.
3885///
3886/// \headerfile <x86intrin.h>
3887///
3888/// This intrinsic corresponds to the \c VMASKMOVDQU / MASKMOVDQU instruction.
3889///
3890/// \param __d
3891///    A 128-bit integer vector containing the values to be moved.
3892/// \param __n
3893///    A 128-bit integer vector containing the mask. The most significant bit of
3894///    each byte represents the mask bits.
3895/// \param __p
3896///    A pointer to an unaligned 128-bit memory location where the specified
3897///    values are moved.
3898static __inline__ void __DEFAULT_FN_ATTRS
3899_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
3900{
3901  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3902}
3903
3904/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3905///    a memory location.
3906///
3907/// \headerfile <x86intrin.h>
3908///
3909/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
3910///
3911/// \param __p
3912///    A pointer to a 64-bit memory location that will receive the lower 64 bits
3913///    of the integer vector parameter.
3914/// \param __a
3915///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3916///    value to be stored.
3917static __inline__ void __DEFAULT_FN_ATTRS
3918_mm_storel_epi64(__m128i *__p, __m128i __a)
3919{
3920  struct __mm_storel_epi64_struct {
3921    long long __u;
3922  } __attribute__((__packed__, __may_alias__));
3923  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
3924}
3925
3926/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3927///    aligned memory location. To minimize caching, the data is flagged as
3928///    non-temporal (unlikely to be used again soon).
3929///
3930/// \headerfile <x86intrin.h>
3931///
3932/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
3933///
3934/// \param __p
3935///    A pointer to the 128-bit aligned memory location used to store the value.
3936/// \param __a
3937///    A vector of [2 x double] containing the 64-bit values to be stored.
3938static __inline__ void __DEFAULT_FN_ATTRS
3939_mm_stream_pd(double *__p, __m128d __a)
3940{
3941  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
3942}
3943
3944/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
3945///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3946///    used again soon).
3947///
3948/// \headerfile <x86intrin.h>
3949///
3950/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
3951///
3952/// \param __p
3953///    A pointer to the 128-bit aligned memory location used to store the value.
3954/// \param __a
3955///    A 128-bit integer vector containing the values to be stored.
3956static __inline__ void __DEFAULT_FN_ATTRS
3957_mm_stream_si128(__m128i *__p, __m128i __a)
3958{
3959  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
3960}
3961
3962/// \brief Stores a 32-bit integer value in the specified memory location. To
3963///    minimize caching, the data is flagged as non-temporal (unlikely to be
3964///    used again soon).
3965///
3966/// \headerfile <x86intrin.h>
3967///
3968/// This intrinsic corresponds to the \c MOVNTI instruction.
3969///
3970/// \param __p
3971///    A pointer to the 32-bit memory location used to store the value.
3972/// \param __a
3973///    A 32-bit integer containing the value to be stored.
3974static __inline__ void __DEFAULT_FN_ATTRS
3975_mm_stream_si32(int *__p, int __a)
3976{
3977  __builtin_ia32_movnti(__p, __a);
3978}
3979
3980#ifdef __x86_64__
3981/// \brief Stores a 64-bit integer value in the specified memory location. To
3982///    minimize caching, the data is flagged as non-temporal (unlikely to be
3983///    used again soon).
3984///
3985/// \headerfile <x86intrin.h>
3986///
3987/// This intrinsic corresponds to the \c MOVNTIQ instruction.
3988///
3989/// \param __p
3990///    A pointer to the 64-bit memory location used to store the value.
3991/// \param __a
3992///    A 64-bit integer containing the value to be stored.
3993static __inline__ void __DEFAULT_FN_ATTRS
3994_mm_stream_si64(long long *__p, long long __a)
3995{
3996  __builtin_ia32_movnti64(__p, __a);
3997}
3998#endif
3999
4000#if defined(__cplusplus)
4001extern "C" {
4002#endif
4003
/// \brief Flushes and invalidates the cache line containing __p from all
///    caches in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CLFLUSH instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);
4015
/// \brief Forces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, so that the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LFENCE instruction.
///
void _mm_lfence(void);
4026
/// \brief Forces strong memory ordering (serialization) between the load and
///    store instructions that precede this instruction and the load and store
///    instructions that follow it, so that the system completes all previous
///    memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MFENCE instruction.
///
void _mm_mfence(void);
4037
4038#if defined(__cplusplus)
4039} // extern "C"
4040#endif
4041
4042/// \brief Converts 16-bit signed integers from both 128-bit integer vector
4043///    operands into 8-bit signed integers, and packs the results into the
4044///    destination. Positive values greater than 0x7F are saturated to 0x7F.
4045///    Negative values less than 0x80 are saturated to 0x80.
4046///
4047/// \headerfile <x86intrin.h>
4048///
4049/// This intrinsic corresponds to the \c VPACKSSWB / PACKSSWB instruction.
4050///
4051/// \param __a
4052///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4053///   a signed integer and is converted to a 8-bit signed integer with
4054///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4055///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4056///   written to the lower 64 bits of the result.
4057/// \param __b
4058///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4059///   a signed integer and is converted to a 8-bit signed integer with
4060///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4061///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4062///   written to the higher 64 bits of the result.
4063/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4064static __inline__ __m128i __DEFAULT_FN_ATTRS
4065_mm_packs_epi16(__m128i __a, __m128i __b)
4066{
4067  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4068}
4069
4070/// \brief Converts 32-bit signed integers from both 128-bit integer vector
4071///    operands into 16-bit signed integers, and packs the results into the
4072///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4073///    Negative values less than 0x8000 are saturated to 0x8000.
4074///
4075/// \headerfile <x86intrin.h>
4076///
4077/// This intrinsic corresponds to the \c VPACKSSDW / PACKSSDW instruction.
4078///
4079/// \param __a
4080///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4081///    a signed integer and is converted to a 16-bit signed integer with
4082///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4083///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4084///    are written to the lower 64 bits of the result.
4085/// \param __b
4086///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4087///    a signed integer and is converted to a 16-bit signed integer with
4088///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4089///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4090///    are written to the higher 64 bits of the result.
4091/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4092static __inline__ __m128i __DEFAULT_FN_ATTRS
4093_mm_packs_epi32(__m128i __a, __m128i __b)
4094{
4095  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4096}
4097
4098/// \brief Converts 16-bit signed integers from both 128-bit integer vector
4099///    operands into 8-bit unsigned integers, and packs the results into the
4100///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
4101///    than 0x00 are saturated to 0x00.
4102///
4103/// \headerfile <x86intrin.h>
4104///
4105/// This intrinsic corresponds to the \c VPACKUSWB / PACKUSWB instruction.
4106///
4107/// \param __a
4108///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4109///    a signed integer and is converted to an 8-bit unsigned integer with
4110///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4111///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4112///    written to the lower 64 bits of the result.
4113/// \param __b
4114///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4115///    a signed integer and is converted to an 8-bit unsigned integer with
4116///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4117///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4118///    written to the higher 64 bits of the result.
4119/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4120static __inline__ __m128i __DEFAULT_FN_ATTRS
4121_mm_packus_epi16(__m128i __a, __m128i __b)
4122{
4123  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4124}
4125
4126/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4127///    the immediate-value parameter as a selector.
4128///
4129/// \headerfile <x86intrin.h>
4130///
4131/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
4132///
4133/// \param __a
4134///    A 128-bit integer vector.
4135/// \param __imm
4136///    An immediate value. Bits [3:0] selects values from __a to be assigned to
4137///    bits[15:0] of the result.
4138///    000: assign values from bits [15:0] of __a.
4139///    001: assign values from bits [31:16] of __a.
4140///    010: assign values from bits [47:32] of __a.
4141///    011: assign values from bits [63:48] of __a.
4142///    100: assign values from bits [79:64] of __a.
4143///    101: assign values from bits [95:80] of __a.
4144///    110: assign values from bits [111:96] of __a.
4145///    111: assign values from bits [127:112] of __a.
4146/// \returns An integer, whose lower 16 bits are selected from the 128-bit
4147///    integer vector parameter and the remaining bits are assigned zeros.
4148static __inline__ int __DEFAULT_FN_ATTRS
4149_mm_extract_epi16(__m128i __a, int __imm)
4150{
4151  __v8hi __b = (__v8hi)__a;
4152  return (unsigned short)__b[__imm & 7];
4153}
4154
4155/// \brief Constructs a 128-bit integer vector by first making a copy of the
4156///    128-bit integer vector parameter, and then inserting the lower 16 bits
4157///    of an integer parameter into an offset specified by the immediate-value
4158///    parameter.
4159///
4160/// \headerfile <x86intrin.h>
4161///
4162/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
4163///
4164/// \param __a
4165///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
4166///    result and then one of the eight elements in the result is replaced by
4167///    the lower 16 bits of __b.
4168/// \param __b
4169///    An integer. The lower 16 bits of this parameter are written to the
4170///    result beginning at an offset specified by __imm.
4171/// \param __imm
4172///    An immediate value specifying the bit offset in the result at which the
4173///    lower 16 bits of__b are written.
4174/// \returns A 128-bit integer vector containing the constructed values.
4175static __inline__ __m128i __DEFAULT_FN_ATTRS
4176_mm_insert_epi16(__m128i __a, int __b, int __imm)
4177{
4178  __v8hi __c = (__v8hi)__a;
4179  __c[__imm & 7] = __b;
4180  return (__m128i)__c;
4181}
4182
4183/// \brief Copies the values of the most significant bits from each 8-bit
4184///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4185///    value, zero-extends the value, and writes it to the destination.
4186///
4187/// \headerfile <x86intrin.h>
4188///
4189/// This intrinsic corresponds to the \c VPMOVMSKB / PMOVMSKB instruction.
4190///
4191/// \param __a
4192///    A 128-bit integer vector containing the values with bits to be extracted.
4193/// \returns The most significant bits from each 8-bit element in __a, written
4194///    to bits [15:0]. The other bits are assigned zeros.
4195static __inline__ int __DEFAULT_FN_ATTRS
4196_mm_movemask_epi8(__m128i __a)
4197{
4198  return __builtin_ia32_pmovmskb128((__v16qi)__a);
4199}
4200
/// \brief Constructs a 128-bit integer vector by shuffling the four 32-bit
///    elements of a 128-bit integer vector parameter, using the
///    immediate-value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD / PSHUFD instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements
///    to copy from a. Each 2-bit field selects the source for one 32-bit
///    destination element:
///    Bits [1:0] are used to assign values to bits [31:0] of the result.
///    Bits [3:2] are used to assign values to bits [63:32] of the result.
///    Bits [5:4] are used to assign values to bits [95:64] of the result.
///    Bits [7:6] are used to assign values to bits [127:96] of the result.
///    Bit value assignments:
///    00: assign values from bits [31:0] of a.
///    01: assign values from bits [63:32] of a.
///    10: assign values from bits [95:64] of a.
///    11: assign values from bits [127:96] of a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v4si)(__m128i)(a), \
      (__v4si)_mm_undefined_si128(), \
      ((imm) & 3), \
      (((imm) >> 2) & 3), \
      (((imm) >> 4) & 3), \
      (((imm) >> 6) & 3)); })
4234
/// \brief Constructs a 128-bit integer vector by shuffling the four lower
///    16-bit elements of a 128-bit integer vector of [8 x i16], using the
///    immediate-value parameter as a specifier. The four upper elements are
///    copied unchanged.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW / PSHUFLW instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from a.
///    Bits [1:0] are used to assign values to bits [15:0] of the result.
///    Bits [3:2] are used to assign values to bits [31:16] of the result.
///    Bits [5:4] are used to assign values to bits [47:32] of the result.
///    Bits [7:6] are used to assign values to bits [63:48] of the result.
///    Bit value assignments:
///    00: assign values from bits [15:0] of a.
///    01: assign values from bits [31:16] of a.
///    10: assign values from bits [47:32] of a.
///    11: assign values from bits [63:48] of a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)(__m128i)(a), \
      (__v8hi)_mm_undefined_si128(), \
      ((imm) & 3), \
      (((imm) >> 2) & 3), \
      (((imm) >> 4) & 3), \
      (((imm) >> 6) & 3), \
      4, 5, 6, 7); })
4268
/// \brief Constructs a 128-bit integer vector by shuffling the four upper
///    16-bit elements of a 128-bit integer vector of [8 x i16], using the
///    immediate-value parameter as a specifier. The four lower elements are
///    copied unchanged.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW / PSHUFHW instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from a.
///    Bits [1:0] are used to assign values to bits [79:64] of the result.
///    Bits [3:2] are used to assign values to bits [95:80] of the result.
///    Bits [5:4] are used to assign values to bits [111:96] of the result.
///    Bits [7:6] are used to assign values to bits [127:112] of the result.
///    Bit value assignments:
///    00: assign values from bits [79:64] of a.
///    01: assign values from bits [95:80] of a.
///    10: assign values from bits [111:96] of a.
///    11: assign values from bits [127:112] of a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)(__m128i)(a), \
      (__v8hi)_mm_undefined_si128(), \
      0, 1, 2, 3, \
      (((imm) & 3) + 4), \
      ((((imm) >> 2) & 3) + 4), \
      ((((imm) >> 4) & 3) + 4), \
      ((((imm) >> 6) & 3) + 4)); })
4304
4305/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
4306///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4307///
4308/// \headerfile <x86intrin.h>
4309///
4310/// This intrinsic corresponds to the \c VPUNPCKHBW / PUNPCKHBW instruction.
4311///
4312/// \param __a
4313///    A 128-bit vector of [16 x i8].
4314///    Bits [71:64] are written to bits [7:0] of the result
4315///    Bits [79:72] are written to bits [23:16] of the result.
4316///    Bits [87:80] are written to bits [39:32] of the result.
4317///    Bits [95:88] are written to bits [55:48] of the result.
4318///    Bits [103:96] are written to bits [71:64] of the result.
4319///    Bits [111:104] are written to bits [87:80] of the result.
4320///    Bits [119:112] are written to bits [103:96] of the result.
4321///    Bits [127:120] are written to bits [119:112] of the result.
4322/// \param __b
4323///    A 128-bit vector of [16 x i8].
4324///    Bits [71:64] are written to bits [15:8] of the result.
4325///    Bits [79:72] are written to bits [31:24] of the result.
4326///    Bits [87:80] are written to bits [47:40] of the result.
4327///    Bits [95:88] are written to bits [63:56] of the result.
4328///    Bits [103:96] are written to bits [79:72] of the result.
4329///    Bits [111:104] are written to bits [95:88] of the result.
4330///    Bits [119:112] are written to bits [111:104] of the result.
4331///    Bits [127:120] are written to bits [127:120] of the destination.
4332/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4333static __inline__ __m128i __DEFAULT_FN_ATTRS
4334_mm_unpackhi_epi8(__m128i __a, __m128i __b)
4335{
4336  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
4337}
4338
4339/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4340///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4341///
4342/// \headerfile <x86intrin.h>
4343///
4344/// This intrinsic corresponds to the \c VPUNPCKHWD / PUNPCKHWD instruction.
4345///
4346/// \param __a
4347///    A 128-bit vector of [8 x i16].
4348///    Bits [79:64] are written to bits [15:0] of the result.
4349///    Bits [95:80] are written to bits [47:32] of the result.
4350///    Bits [111:96] are written to bits [79:64] of the result.
4351///    Bits [127:112] are written to bits [111:96] of the result.
4352/// \param __b
4353///    A 128-bit vector of [8 x i16].
4354///    Bits [79:64] are written to bits [31:16] of the result.
4355///    Bits [95:80] are written to bits [63:48] of the result.
4356///    Bits [111:96] are written to bits [95:80] of the result.
4357///    Bits [127:112] are written to bits [127:112] of the result.
4358/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4359static __inline__ __m128i __DEFAULT_FN_ATTRS
4360_mm_unpackhi_epi16(__m128i __a, __m128i __b)
4361{
4362  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
4363}
4364
4365/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4366///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4367///
4368/// \headerfile <x86intrin.h>
4369///
4370/// This intrinsic corresponds to the \c VPUNPCKHDQ / PUNPCKHDQ instruction.
4371///
4372/// \param __a
4373///    A 128-bit vector of [4 x i32].
4374///    Bits [95:64] are written to bits [31:0] of the destination.
4375///    Bits [127:96] are written to bits [95:64] of the destination.
4376/// \param __b
4377///    A 128-bit vector of [4 x i32].
4378///    Bits [95:64] are written to bits [64:32] of the destination.
4379///    Bits [127:96] are written to bits [127:96] of the destination.
4380/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4381static __inline__ __m128i __DEFAULT_FN_ATTRS
4382_mm_unpackhi_epi32(__m128i __a, __m128i __b)
4383{
4384  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
4385}
4386
4387/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
4388///    of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4389///
4390/// \headerfile <x86intrin.h>
4391///
4392/// This intrinsic corresponds to the \c VPUNPCKHQDQ / PUNPCKHQDQ instruction.
4393///
4394/// \param __a
4395///    A 128-bit vector of [2 x i64].
4396///    Bits [127:64] are written to bits [63:0] of the destination.
4397/// \param __b
4398///    A 128-bit vector of [2 x i64].
4399///    Bits [127:64] are written to bits [127:64] of the destination.
4400/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4401static __inline__ __m128i __DEFAULT_FN_ATTRS
4402_mm_unpackhi_epi64(__m128i __a, __m128i __b)
4403{
4404  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
4405}
4406
4407/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4408///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4409///
4410/// \headerfile <x86intrin.h>
4411///
4412/// This intrinsic corresponds to the \c VPUNPCKLBW / PUNPCKLBW instruction.
4413///
4414/// \param __a
4415///    A 128-bit vector of [16 x i8].
4416///    Bits [7:0] are written to bits [7:0] of the result.
4417///    Bits [15:8] are written to bits [23:16] of the result.
4418///    Bits [23:16] are written to bits [39:32] of the result.
4419///    Bits [31:24] are written to bits [55:48] of the result.
4420///    Bits [39:32] are written to bits [71:64] of the result.
4421///    Bits [47:40] are written to bits [87:80] of the result.
4422///    Bits [55:48] are written to bits [103:96] of the result.
4423///    Bits [63:56] are written to bits [119:112] of the destination.
4424/// \param __b
4425///    A 128-bit vector of [16 x i8].
4426///    Bits [7:0] are written to bits [15:8] of the result.
4427///    Bits [15:8] are written to bits [31:24] of the result.
4428///    Bits [23:16] are written to bits [47:40] of the result.
4429///    Bits [31:24] are written to bits [63:56] of the result.
4430///    Bits [39:32] are written to bits [79:72] of the result.
4431///    Bits [47:40] are written to bits [95:88] of the result.
4432///    Bits [55:48] are written to bits [111:104] of the result.
4433///    Bits [63:56] are written to bits [127:120] of the result.
4434/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4435static __inline__ __m128i __DEFAULT_FN_ATTRS
4436_mm_unpacklo_epi8(__m128i __a, __m128i __b)
4437{
4438  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
4439}
4440
4441/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
4442///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4443///    [8 x i16].
4444///
4445/// \headerfile <x86intrin.h>
4446///
4447/// This intrinsic corresponds to the \c VPUNPCKLWD / PUNPCKLWD instruction.
4448///
4449/// \param __a
4450///    A 128-bit vector of [8 x i16].
4451///    Bits [15:0] are written to bits [15:0] of the result.
4452///    Bits [31:16] are written to bits [47:32] of the result.
4453///    Bits [47:32] are written to bits [79:64] of the result.
4454///    Bits [63:48] are written to bits [111:96] of the result.
4455/// \param __b
4456///    A 128-bit vector of [8 x i16].
4457///    Bits [15:0] are written to bits [31:16] of the result.
4458///    Bits [31:16] are written to bits [63:48] of the result.
4459///    Bits [47:32] are written to bits [95:80] of the result.
4460///    Bits [63:48] are written to bits [127:112] of the result.
4461/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4462static __inline__ __m128i __DEFAULT_FN_ATTRS
4463_mm_unpacklo_epi16(__m128i __a, __m128i __b)
4464{
4465  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
4466}
4467
4468/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4469///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4470///
4471/// \headerfile <x86intrin.h>
4472///
4473/// This intrinsic corresponds to the \c VPUNPCKLDQ / PUNPCKLDQ instruction.
4474///
4475/// \param __a
4476///    A 128-bit vector of [4 x i32].
4477///    Bits [31:0] are written to bits [31:0] of the destination.
4478///    Bits [63:32] are written to bits [95:64] of the destination.
4479/// \param __b
4480///    A 128-bit vector of [4 x i32].
4481///    Bits [31:0] are written to bits [64:32] of the destination.
4482///    Bits [63:32] are written to bits [127:96] of the destination.
4483/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4484static __inline__ __m128i __DEFAULT_FN_ATTRS
4485_mm_unpacklo_epi32(__m128i __a, __m128i __b)
4486{
4487  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
4488}
4489
4490/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
4491///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4492///
4493/// \headerfile <x86intrin.h>
4494///
4495/// This intrinsic corresponds to the \c VPUNPCKLQDQ / PUNPCKLQDQ instruction.
4496///
4497/// \param __a
4498///    A 128-bit vector of [2 x i64].
4499///    Bits [63:0] are written to bits [63:0] of the destination.
4500/// \param __b
4501///    A 128-bit vector of [2 x i64].
4502///    Bits [63:0] are written to bits [127:64] of the destination.
4503/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4504static __inline__ __m128i __DEFAULT_FN_ATTRS
4505_mm_unpacklo_epi64(__m128i __a, __m128i __b)
4506{
4507  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
4508}
4509
4510/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4511///    integer.
4512///
4513/// \headerfile <x86intrin.h>
4514///
4515/// This intrinsic has no corresponding instruction.
4516///
4517/// \param __a
4518///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4519///    destination.
4520/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4521static __inline__ __m64 __DEFAULT_FN_ATTRS
4522_mm_movepi64_pi64(__m128i __a)
4523{
4524  return (__m64)__a[0];
4525}
4526
4527/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4528///    upper bits.
4529///
4530/// \headerfile <x86intrin.h>
4531///
4532/// This intrinsic corresponds to the \c VMOVQ / MOVQ / MOVD instruction.
4533///
4534/// \param __a
4535///    A 64-bit value.
4536/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4537///    the operand. The upper 64 bits are assigned zeros.
4538static __inline__ __m128i __DEFAULT_FN_ATTRS
4539_mm_movpi64_epi64(__m64 __a)
4540{
4541  return (__m128i){ (long long)__a, 0 };
4542}
4543
4544/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4545///    integer vector, zeroing the upper bits.
4546///
4547/// \headerfile <x86intrin.h>
4548///
4549/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
4550///
4551/// \param __a
4552///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4553///    destination.
4554/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4555///    the operand. The upper 64 bits are assigned zeros.
4556static __inline__ __m128i __DEFAULT_FN_ATTRS
4557_mm_move_epi64(__m128i __a)
4558{
4559  return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
4560}
4561
4562/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
4563///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4564///    double].
4565///
4566/// \headerfile <x86intrin.h>
4567///
4568/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
4569///
4570/// \param __a
4571///    A 128-bit vector of [2 x double].
4572///    Bits [127:64] are written to bits [63:0] of the destination.
4573/// \param __b
4574///    A 128-bit vector of [2 x double].
4575///    Bits [127:64] are written to bits [127:64] of the destination.
4576/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4577static __inline__ __m128d __DEFAULT_FN_ATTRS
4578_mm_unpackhi_pd(__m128d __a, __m128d __b)
4579{
4580  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
4581}
4582
4583/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
4584///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4585///    double].
4586///
4587/// \headerfile <x86intrin.h>
4588///
4589/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
4590///
4591/// \param __a
4592///    A 128-bit vector of [2 x double].
4593///    Bits [63:0] are written to bits [63:0] of the destination.
4594/// \param __b
4595///    A 128-bit vector of [2 x double].
4596///    Bits [63:0] are written to bits [127:64] of the destination.
4597/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4598static __inline__ __m128d __DEFAULT_FN_ATTRS
4599_mm_unpacklo_pd(__m128d __a, __m128d __b)
4600{
4601  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
4602}
4603
4604/// \brief Extracts the sign bits of the double-precision values in the 128-bit
4605///    vector of [2 x double], zero-extends the value, and writes it to the
4606///    low-order bits of the destination.
4607///
4608/// \headerfile <x86intrin.h>
4609///
4610/// This intrinsic corresponds to the \c VMOVMSKPD / MOVMSKPD instruction.
4611///
4612/// \param __a
4613///    A 128-bit vector of [2 x double] containing the values with sign bits to
4614///    be extracted.
4615/// \returns The sign bits from each of the double-precision elements in __a,
4616///    written to bits [1:0]. The remaining bits are assigned values of zero.
4617static __inline__ int __DEFAULT_FN_ATTRS
4618_mm_movemask_pd(__m128d __a)
4619{
4620  return __builtin_ia32_movmskpd((__v2df)__a);
4621}
4622
4623
/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from a and b; all other bits are ignored:
///    Bit[0] = 0: lower element of a copied to lower element of result.
///    Bit[0] = 1: upper element of a copied to lower element of result.
///    Bit[1] = 0: lower element of b copied to upper element of result.
///    Bit[1] = 1: upper element of b copied to upper element of result.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
/* Implemented as a plain parenthesized expression rather than a GNU statement
   expression: the body is a single expression, and statement expressions are
   rejected in some contexts (for example at file scope). Shuffle indices 0-1
   select from a and indices 2-3 select from b. */
#define _mm_shuffle_pd(a, b, i) \
  ((__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), \
                                    (__v2df)(__m128d)(b), \
                                    0 + (((i) >> 0) & 0x1), \
                                    2 + (((i) >> 1) & 0x1)))
4652
4653/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4654///    floating-point vector of [4 x float].
4655///
4656/// \headerfile <x86intrin.h>
4657///
4658/// This intrinsic has no corresponding instruction.
4659///
4660/// \param __a
4661///    A 128-bit floating-point vector of [2 x double].
4662/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4663///    bitwise pattern as the parameter.
4664static __inline__ __m128 __DEFAULT_FN_ATTRS
4665_mm_castpd_ps(__m128d __a)
4666{
4667  return (__m128)__a;
4668}
4669
4670/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4671///    integer vector.
4672///
4673/// \headerfile <x86intrin.h>
4674///
4675/// This intrinsic has no corresponding instruction.
4676///
4677/// \param __a
4678///    A 128-bit floating-point vector of [2 x double].
4679/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4680///    parameter.
4681static __inline__ __m128i __DEFAULT_FN_ATTRS
4682_mm_castpd_si128(__m128d __a)
4683{
4684  return (__m128i)__a;
4685}
4686
4687/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4688///    floating-point vector of [2 x double].
4689///
4690/// \headerfile <x86intrin.h>
4691///
4692/// This intrinsic has no corresponding instruction.
4693///
4694/// \param __a
4695///    A 128-bit floating-point vector of [4 x float].
4696/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4697///    bitwise pattern as the parameter.
4698static __inline__ __m128d __DEFAULT_FN_ATTRS
4699_mm_castps_pd(__m128 __a)
4700{
4701  return (__m128d)__a;
4702}
4703
4704/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4705///    integer vector.
4706///
4707/// \headerfile <x86intrin.h>
4708///
4709/// This intrinsic has no corresponding instruction.
4710///
4711/// \param __a
4712///    A 128-bit floating-point vector of [4 x float].
4713/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4714///    parameter.
4715static __inline__ __m128i __DEFAULT_FN_ATTRS
4716_mm_castps_si128(__m128 __a)
4717{
4718  return (__m128i)__a;
4719}
4720
4721/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
4722///    of [4 x float].
4723///
4724/// \headerfile <x86intrin.h>
4725///
4726/// This intrinsic has no corresponding instruction.
4727///
4728/// \param __a
4729///    A 128-bit integer vector.
4730/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4731///    bitwise pattern as the parameter.
4732static __inline__ __m128 __DEFAULT_FN_ATTRS
4733_mm_castsi128_ps(__m128i __a)
4734{
4735  return (__m128)__a;
4736}
4737
4738/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
4739///    of [2 x double].
4740///
4741/// \headerfile <x86intrin.h>
4742///
4743/// This intrinsic has no corresponding instruction.
4744///
4745/// \param __a
4746///    A 128-bit integer vector.
4747/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4748///    bitwise pattern as the parameter.
4749static __inline__ __m128d __DEFAULT_FN_ATTRS
4750_mm_castsi128_pd(__m128i __a)
4751{
4752  return (__m128d)__a;
4753}
4754
/// \brief Signals that the caller is executing a spin loop, so that power
///    consumption during the loop can be optimized.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PAUSE instruction.
///
/* Only the prototype is declared here; it is given C linkage when this header
   is compiled as C++. */
#ifdef __cplusplus
extern "C"
#endif
void _mm_pause(void);
4766
4767#undef __DEFAULT_FN_ATTRS
4768
/* Packs two single-bit selectors into one value: x is placed in bit 1 and y
   in bit 0, which are the two immediate bits that _mm_shuffle_pd reads. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
4770
4771#endif /* __EMMINTRIN_H */
4772