1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#include <xmmintrin.h>
28
/* 16-byte vector types that appear in the intrinsic signatures. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Element-typed views of a 16-byte vector, used for lane-wise arithmetic
 * and for casts inside the intrinsic bodies. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned element views. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* Plain char does not pin down signedness, so an explicitly signed byte view
 * is provided as well. It is not meant to appear in the interface. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
46
47#include <f16cintrin.h>
48
/* Attribute set attached to every intrinsic defined in this file:
 * always-inline, no debug info, and the "sse2" target feature. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
51
52/// \brief Adds lower double-precision values in both operands and returns the
53///    sum in the lower 64 bits of the result. The upper 64 bits of the result
54///    are copied from the upper double-precision value of the first operand.
55///
56/// \headerfile <x86intrin.h>
57///
58/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
59///
60/// \param __a
61///    A 128-bit vector of [2 x double] containing one of the source operands.
62/// \param __b
63///    A 128-bit vector of [2 x double] containing one of the source operands.
64/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
65///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
66///    from the upper 64 bits of the first source operand.
67static __inline__ __m128d __DEFAULT_FN_ATTRS
68_mm_add_sd(__m128d __a, __m128d __b)
69{
70  __a[0] += __b[0];
71  return __a;
72}
73
74/// \brief Adds two 128-bit vectors of [2 x double].
75///
76/// \headerfile <x86intrin.h>
77///
78/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
79///
80/// \param __a
81///    A 128-bit vector of [2 x double] containing one of the source operands.
82/// \param __b
83///    A 128-bit vector of [2 x double] containing one of the source operands.
84/// \returns A 128-bit vector of [2 x double] containing the sums of both
85///    operands.
86static __inline__ __m128d __DEFAULT_FN_ATTRS
87_mm_add_pd(__m128d __a, __m128d __b)
88{
89  return (__m128d)((__v2df)__a + (__v2df)__b);
90}
91
92/// \brief Subtracts the lower double-precision value of the second operand
93///    from the lower double-precision value of the first operand and returns
94///    the difference in the lower 64 bits of the result. The upper 64 bits of
95///    the result are copied from the upper double-precision value of the first
96///    operand.
97///
98/// \headerfile <x86intrin.h>
99///
100/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
101///
102/// \param __a
103///    A 128-bit vector of [2 x double] containing the minuend.
104/// \param __b
105///    A 128-bit vector of [2 x double] containing the subtrahend.
106/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
107///    difference of the lower 64 bits of both operands. The upper 64 bits are
108///    copied from the upper 64 bits of the first source operand.
109static __inline__ __m128d __DEFAULT_FN_ATTRS
110_mm_sub_sd(__m128d __a, __m128d __b)
111{
112  __a[0] -= __b[0];
113  return __a;
114}
115
116/// \brief Subtracts two 128-bit vectors of [2 x double].
117///
118/// \headerfile <x86intrin.h>
119///
120/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
121///
122/// \param __a
123///    A 128-bit vector of [2 x double] containing the minuend.
124/// \param __b
125///    A 128-bit vector of [2 x double] containing the subtrahend.
126/// \returns A 128-bit vector of [2 x double] containing the differences between
127///    both operands.
128static __inline__ __m128d __DEFAULT_FN_ATTRS
129_mm_sub_pd(__m128d __a, __m128d __b)
130{
131  return (__m128d)((__v2df)__a - (__v2df)__b);
132}
133
134/// \brief Multiplies lower double-precision values in both operands and returns
135///    the product in the lower 64 bits of the result. The upper 64 bits of the
136///    result are copied from the upper double-precision value of the first
137///    operand.
138///
139/// \headerfile <x86intrin.h>
140///
141/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
142///
143/// \param __a
144///    A 128-bit vector of [2 x double] containing one of the source operands.
145/// \param __b
146///    A 128-bit vector of [2 x double] containing one of the source operands.
147/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
148///    product of the lower 64 bits of both operands. The upper 64 bits are
149///    copied from the upper 64 bits of the first source operand.
150static __inline__ __m128d __DEFAULT_FN_ATTRS
151_mm_mul_sd(__m128d __a, __m128d __b)
152{
153  __a[0] *= __b[0];
154  return __a;
155}
156
157/// \brief Multiplies two 128-bit vectors of [2 x double].
158///
159/// \headerfile <x86intrin.h>
160///
161/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
162///
163/// \param __a
164///    A 128-bit vector of [2 x double] containing one of the operands.
165/// \param __b
166///    A 128-bit vector of [2 x double] containing one of the operands.
167/// \returns A 128-bit vector of [2 x double] containing the products of both
168///    operands.
169static __inline__ __m128d __DEFAULT_FN_ATTRS
170_mm_mul_pd(__m128d __a, __m128d __b)
171{
172  return (__m128d)((__v2df)__a * (__v2df)__b);
173}
174
175/// \brief Divides the lower double-precision value of the first operand by the
176///    lower double-precision value of the second operand and returns the
177///    quotient in the lower 64 bits of the result. The upper 64 bits of the
178///    result are copied from the upper double-precision value of the first
179///    operand.
180///
181/// \headerfile <x86intrin.h>
182///
183/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
184///
185/// \param __a
186///    A 128-bit vector of [2 x double] containing the dividend.
187/// \param __b
///    A 128-bit vector of [2 x double] containing the divisor.
189/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
190///    quotient of the lower 64 bits of both operands. The upper 64 bits are
191///    copied from the upper 64 bits of the first source operand.
192static __inline__ __m128d __DEFAULT_FN_ATTRS
193_mm_div_sd(__m128d __a, __m128d __b)
194{
195  __a[0] /= __b[0];
196  return __a;
197}
198
199/// \brief Performs an element-by-element division of two 128-bit vectors of
200///    [2 x double].
201///
202/// \headerfile <x86intrin.h>
203///
204/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
205///
206/// \param __a
207///    A 128-bit vector of [2 x double] containing the dividend.
208/// \param __b
209///    A 128-bit vector of [2 x double] containing the divisor.
210/// \returns A 128-bit vector of [2 x double] containing the quotients of both
211///    operands.
212static __inline__ __m128d __DEFAULT_FN_ATTRS
213_mm_div_pd(__m128d __a, __m128d __b)
214{
215  return (__m128d)((__v2df)__a / (__v2df)__b);
216}
217
218/// \brief Calculates the square root of the lower double-precision value of
219///    the second operand and returns it in the lower 64 bits of the result.
220///    The upper 64 bits of the result are copied from the upper double-
221///    precision value of the first operand.
222///
223/// \headerfile <x86intrin.h>
224///
225/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
226///
227/// \param __a
228///    A 128-bit vector of [2 x double] containing one of the operands. The
229///    upper 64 bits of this operand are copied to the upper 64 bits of the
230///    result.
231/// \param __b
232///    A 128-bit vector of [2 x double] containing one of the operands. The
233///    square root is calculated using the lower 64 bits of this operand.
234/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
235///    square root of the lower 64 bits of operand \a __b, and whose upper 64
236///    bits are copied from the upper 64 bits of operand \a __a.
237static __inline__ __m128d __DEFAULT_FN_ATTRS
238_mm_sqrt_sd(__m128d __a, __m128d __b)
239{
240  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
241  return (__m128d) { __c[0], __a[1] };
242}
243
/// \brief Calculates the square root of each of the two values stored in a
245///    128-bit vector of [2 x double].
246///
247/// \headerfile <x86intrin.h>
248///
249/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
250///
251/// \param __a
252///    A 128-bit vector of [2 x double].
253/// \returns A 128-bit vector of [2 x double] containing the square roots of the
254///    values in the operand.
255static __inline__ __m128d __DEFAULT_FN_ATTRS
256_mm_sqrt_pd(__m128d __a)
257{
258  return __builtin_ia32_sqrtpd((__v2df)__a);
259}
260
261/// \brief Compares lower 64-bit double-precision values of both operands, and
///    returns the lesser of the pair of values in the lower 64 bits of the
263///    result. The upper 64 bits of the result are copied from the upper double-
264///    precision value of the first operand.
265///
266/// \headerfile <x86intrin.h>
267///
268/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
269///
270/// \param __a
271///    A 128-bit vector of [2 x double] containing one of the operands. The
272///    lower 64 bits of this operand are used in the comparison.
273/// \param __b
274///    A 128-bit vector of [2 x double] containing one of the operands. The
275///    lower 64 bits of this operand are used in the comparison.
276/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
277///    minimum value between both operands. The upper 64 bits are copied from
278///    the upper 64 bits of the first source operand.
279static __inline__ __m128d __DEFAULT_FN_ATTRS
280_mm_min_sd(__m128d __a, __m128d __b)
281{
282  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
283}
284
285/// \brief Performs element-by-element comparison of the two 128-bit vectors of
286///    [2 x double] and returns the vector containing the lesser of each pair of
287///    values.
288///
289/// \headerfile <x86intrin.h>
290///
291/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
292///
293/// \param __a
294///    A 128-bit vector of [2 x double] containing one of the operands.
295/// \param __b
296///    A 128-bit vector of [2 x double] containing one of the operands.
297/// \returns A 128-bit vector of [2 x double] containing the minimum values
298///    between both operands.
299static __inline__ __m128d __DEFAULT_FN_ATTRS
300_mm_min_pd(__m128d __a, __m128d __b)
301{
302  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
303}
304
/// \brief Compares lower 64-bit double-precision values of both operands, and
///    returns the greater of the pair of values in the lower 64 bits of the
307///    result. The upper 64 bits of the result are copied from the upper double-
308///    precision value of the first operand.
309///
310/// \headerfile <x86intrin.h>
311///
312/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
313///
314/// \param __a
315///    A 128-bit vector of [2 x double] containing one of the operands. The
316///    lower 64 bits of this operand are used in the comparison.
317/// \param __b
318///    A 128-bit vector of [2 x double] containing one of the operands. The
319///    lower 64 bits of this operand are used in the comparison.
320/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
321///    maximum value between both operands. The upper 64 bits are copied from
322///    the upper 64 bits of the first source operand.
323static __inline__ __m128d __DEFAULT_FN_ATTRS
324_mm_max_sd(__m128d __a, __m128d __b)
325{
326  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
327}
328
329/// \brief Performs element-by-element comparison of the two 128-bit vectors of
330///    [2 x double] and returns the vector containing the greater of each pair
331///    of values.
332///
333/// \headerfile <x86intrin.h>
334///
335/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
336///
337/// \param __a
338///    A 128-bit vector of [2 x double] containing one of the operands.
339/// \param __b
340///    A 128-bit vector of [2 x double] containing one of the operands.
341/// \returns A 128-bit vector of [2 x double] containing the maximum values
342///    between both operands.
343static __inline__ __m128d __DEFAULT_FN_ATTRS
344_mm_max_pd(__m128d __a, __m128d __b)
345{
346  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
347}
348
349/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double].
350///
351/// \headerfile <x86intrin.h>
352///
353/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
354///
355/// \param __a
356///    A 128-bit vector of [2 x double] containing one of the source operands.
357/// \param __b
358///    A 128-bit vector of [2 x double] containing one of the source operands.
359/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
360///    values between both operands.
361static __inline__ __m128d __DEFAULT_FN_ATTRS
362_mm_and_pd(__m128d __a, __m128d __b)
363{
364  return (__m128d)((__v2du)__a & (__v2du)__b);
365}
366
367/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using
368///    the one's complement of the values contained in the first source operand.
369///
370/// \headerfile <x86intrin.h>
371///
372/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
373///
374/// \param __a
375///    A 128-bit vector of [2 x double] containing the left source operand. The
376///    one's complement of this value is used in the bitwise AND.
377/// \param __b
378///    A 128-bit vector of [2 x double] containing the right source operand.
379/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
380///    values in the second operand and the one's complement of the first
381///    operand.
382static __inline__ __m128d __DEFAULT_FN_ATTRS
383_mm_andnot_pd(__m128d __a, __m128d __b)
384{
385  return (__m128d)(~(__v2du)__a & (__v2du)__b);
386}
387
388/// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double].
389///
390/// \headerfile <x86intrin.h>
391///
392/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
393///
394/// \param __a
395///    A 128-bit vector of [2 x double] containing one of the source operands.
396/// \param __b
397///    A 128-bit vector of [2 x double] containing one of the source operands.
398/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
399///    values between both operands.
400static __inline__ __m128d __DEFAULT_FN_ATTRS
401_mm_or_pd(__m128d __a, __m128d __b)
402{
403  return (__m128d)((__v2du)__a | (__v2du)__b);
404}
405
406/// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double].
407///
408/// \headerfile <x86intrin.h>
409///
410/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
411///
412/// \param __a
413///    A 128-bit vector of [2 x double] containing one of the source operands.
414/// \param __b
415///    A 128-bit vector of [2 x double] containing one of the source operands.
416/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
417///    values between both operands.
418static __inline__ __m128d __DEFAULT_FN_ATTRS
419_mm_xor_pd(__m128d __a, __m128d __b)
420{
421  return (__m128d)((__v2du)__a ^ (__v2du)__b);
422}
423
424/// \brief Compares each of the corresponding double-precision values of the
425///    128-bit vectors of [2 x double] for equality. Each comparison yields 0h
426///    for false, FFFFFFFFFFFFFFFFh for true.
427///
428/// \headerfile <x86intrin.h>
429///
430/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
431///
432/// \param __a
433///    A 128-bit vector of [2 x double].
434/// \param __b
435///    A 128-bit vector of [2 x double].
436/// \returns A 128-bit vector containing the comparison results.
437static __inline__ __m128d __DEFAULT_FN_ATTRS
438_mm_cmpeq_pd(__m128d __a, __m128d __b)
439{
440  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
441}
442
443/// \brief Compares each of the corresponding double-precision values of the
444///    128-bit vectors of [2 x double] to determine if the values in the first
445///    operand are less than those in the second operand. Each comparison
446///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
447///
448/// \headerfile <x86intrin.h>
449///
450/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
451///
452/// \param __a
453///    A 128-bit vector of [2 x double].
454/// \param __b
455///    A 128-bit vector of [2 x double].
456/// \returns A 128-bit vector containing the comparison results.
457static __inline__ __m128d __DEFAULT_FN_ATTRS
458_mm_cmplt_pd(__m128d __a, __m128d __b)
459{
460  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
461}
462
463/// \brief Compares each of the corresponding double-precision values of the
464///    128-bit vectors of [2 x double] to determine if the values in the first
465///    operand are less than or equal to those in the second operand. Each
466///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
467///
468/// \headerfile <x86intrin.h>
469///
470/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
471///
472/// \param __a
473///    A 128-bit vector of [2 x double].
474/// \param __b
475///    A 128-bit vector of [2 x double].
476/// \returns A 128-bit vector containing the comparison results.
477static __inline__ __m128d __DEFAULT_FN_ATTRS
478_mm_cmple_pd(__m128d __a, __m128d __b)
479{
480  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
481}
482
483/// \brief Compares each of the corresponding double-precision values of the
484///    128-bit vectors of [2 x double] to determine if the values in the first
485///    operand are greater than those in the second operand. Each comparison
486///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
487///
488/// \headerfile <x86intrin.h>
489///
490/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
491///
492/// \param __a
493///    A 128-bit vector of [2 x double].
494/// \param __b
495///    A 128-bit vector of [2 x double].
496/// \returns A 128-bit vector containing the comparison results.
497static __inline__ __m128d __DEFAULT_FN_ATTRS
498_mm_cmpgt_pd(__m128d __a, __m128d __b)
499{
500  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
501}
502
503/// \brief Compares each of the corresponding double-precision values of the
504///    128-bit vectors of [2 x double] to determine if the values in the first
505///    operand are greater than or equal to those in the second operand. Each
506///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
507///
508/// \headerfile <x86intrin.h>
509///
510/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
511///
512/// \param __a
513///    A 128-bit vector of [2 x double].
514/// \param __b
515///    A 128-bit vector of [2 x double].
516/// \returns A 128-bit vector containing the comparison results.
517static __inline__ __m128d __DEFAULT_FN_ATTRS
518_mm_cmpge_pd(__m128d __a, __m128d __b)
519{
520  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
521}
522
523/// \brief Compares each of the corresponding double-precision values of the
524///    128-bit vectors of [2 x double] to determine if the values in the first
525///    operand are ordered with respect to those in the second operand. A pair
526///    of double-precision values are "ordered" with respect to each other if
527///    neither value is a NaN. Each comparison yields 0h for false,
528///    FFFFFFFFFFFFFFFFh for true.
529///
530/// \headerfile <x86intrin.h>
531///
532/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
533///
534/// \param __a
535///    A 128-bit vector of [2 x double].
536/// \param __b
537///    A 128-bit vector of [2 x double].
538/// \returns A 128-bit vector containing the comparison results.
539static __inline__ __m128d __DEFAULT_FN_ATTRS
540_mm_cmpord_pd(__m128d __a, __m128d __b)
541{
542  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
543}
544
545/// \brief Compares each of the corresponding double-precision values of the
546///    128-bit vectors of [2 x double] to determine if the values in the first
547///    operand are unordered with respect to those in the second operand. A pair
548///    of double-precision values are "unordered" with respect to each other if
549///    one or both values are NaN. Each comparison yields 0h for false,
550///    FFFFFFFFFFFFFFFFh for true.
551///
552/// \headerfile <x86intrin.h>
553///
554/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
555///   instruction.
556///
557/// \param __a
558///    A 128-bit vector of [2 x double].
559/// \param __b
560///    A 128-bit vector of [2 x double].
561/// \returns A 128-bit vector containing the comparison results.
562static __inline__ __m128d __DEFAULT_FN_ATTRS
563_mm_cmpunord_pd(__m128d __a, __m128d __b)
564{
565  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
566}
567
568/// \brief Compares each of the corresponding double-precision values of the
569///    128-bit vectors of [2 x double] to determine if the values in the first
570///    operand are unequal to those in the second operand. Each comparison
571///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
572///
573/// \headerfile <x86intrin.h>
574///
575/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
576///
577/// \param __a
578///    A 128-bit vector of [2 x double].
579/// \param __b
580///    A 128-bit vector of [2 x double].
581/// \returns A 128-bit vector containing the comparison results.
582static __inline__ __m128d __DEFAULT_FN_ATTRS
583_mm_cmpneq_pd(__m128d __a, __m128d __b)
584{
585  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
586}
587
588/// \brief Compares each of the corresponding double-precision values of the
589///    128-bit vectors of [2 x double] to determine if the values in the first
590///    operand are not less than those in the second operand. Each comparison
591///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
592///
593/// \headerfile <x86intrin.h>
594///
595/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
596///
597/// \param __a
598///    A 128-bit vector of [2 x double].
599/// \param __b
600///    A 128-bit vector of [2 x double].
601/// \returns A 128-bit vector containing the comparison results.
602static __inline__ __m128d __DEFAULT_FN_ATTRS
603_mm_cmpnlt_pd(__m128d __a, __m128d __b)
604{
605  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
606}
607
608/// \brief Compares each of the corresponding double-precision values of the
609///    128-bit vectors of [2 x double] to determine if the values in the first
610///    operand are not less than or equal to those in the second operand. Each
611///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
612///
613/// \headerfile <x86intrin.h>
614///
615/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
616///
617/// \param __a
618///    A 128-bit vector of [2 x double].
619/// \param __b
620///    A 128-bit vector of [2 x double].
621/// \returns A 128-bit vector containing the comparison results.
622static __inline__ __m128d __DEFAULT_FN_ATTRS
623_mm_cmpnle_pd(__m128d __a, __m128d __b)
624{
625  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
626}
627
628/// \brief Compares each of the corresponding double-precision values of the
629///    128-bit vectors of [2 x double] to determine if the values in the first
630///    operand are not greater than those in the second operand. Each
631///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
632///
633/// \headerfile <x86intrin.h>
634///
635/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
636///
637/// \param __a
638///    A 128-bit vector of [2 x double].
639/// \param __b
640///    A 128-bit vector of [2 x double].
641/// \returns A 128-bit vector containing the comparison results.
642static __inline__ __m128d __DEFAULT_FN_ATTRS
643_mm_cmpngt_pd(__m128d __a, __m128d __b)
644{
645  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
646}
647
648/// \brief Compares each of the corresponding double-precision values of the
649///    128-bit vectors of [2 x double] to determine if the values in the first
650///    operand are not greater than or equal to those in the second operand.
651///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
652///
653/// \headerfile <x86intrin.h>
654///
655/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
656///
657/// \param __a
658///    A 128-bit vector of [2 x double].
659/// \param __b
660///    A 128-bit vector of [2 x double].
661/// \returns A 128-bit vector containing the comparison results.
662static __inline__ __m128d __DEFAULT_FN_ATTRS
663_mm_cmpnge_pd(__m128d __a, __m128d __b)
664{
665  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
666}
667
668/// \brief Compares the lower double-precision floating-point values in each of
669///    the two 128-bit floating-point vectors of [2 x double] for equality. The
670///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
671///
672/// \headerfile <x86intrin.h>
673///
674/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
675///
676/// \param __a
677///    A 128-bit vector of [2 x double]. The lower double-precision value is
678///    compared to the lower double-precision value of \a __b.
679/// \param __b
680///    A 128-bit vector of [2 x double]. The lower double-precision value is
681///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
683///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
684static __inline__ __m128d __DEFAULT_FN_ATTRS
685_mm_cmpeq_sd(__m128d __a, __m128d __b)
686{
687  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
688}
689
690/// \brief Compares the lower double-precision floating-point values in each of
691///    the two 128-bit floating-point vectors of [2 x double] to determine if
692///    the value in the first parameter is less than the corresponding value in
693///    the second parameter. The comparison yields 0h for false,
694///    FFFFFFFFFFFFFFFFh for true.
695///
696/// \headerfile <x86intrin.h>
697///
698/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
699///
700/// \param __a
701///    A 128-bit vector of [2 x double]. The lower double-precision value is
702///    compared to the lower double-precision value of \a __b.
703/// \param __b
704///    A 128-bit vector of [2 x double]. The lower double-precision value is
705///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
707///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
708static __inline__ __m128d __DEFAULT_FN_ATTRS
709_mm_cmplt_sd(__m128d __a, __m128d __b)
710{
711  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
712}
713
714/// \brief Compares the lower double-precision floating-point values in each of
715///    the two 128-bit floating-point vectors of [2 x double] to determine if
716///    the value in the first parameter is less than or equal to the
717///    corresponding value in the second parameter. The comparison yields 0h for
718///    false, FFFFFFFFFFFFFFFFh for true.
719///
720/// \headerfile <x86intrin.h>
721///
722/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
723///
724/// \param __a
725///    A 128-bit vector of [2 x double]. The lower double-precision value is
726///    compared to the lower double-precision value of \a __b.
727/// \param __b
728///    A 128-bit vector of [2 x double]. The lower double-precision value is
729///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
731///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
732static __inline__ __m128d __DEFAULT_FN_ATTRS
733_mm_cmple_sd(__m128d __a, __m128d __b)
734{
735  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
736}
737
738/// \brief Compares the lower double-precision floating-point values in each of
739///    the two 128-bit floating-point vectors of [2 x double] to determine if
740///    the value in the first parameter is greater than the corresponding value
741///    in the second parameter. The comparison yields 0h for false,
742///    FFFFFFFFFFFFFFFFh for true.
743///
744/// \headerfile <x86intrin.h>
745///
746/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
747///
748/// \param __a
749///     A 128-bit vector of [2 x double]. The lower double-precision value is
750///     compared to the lower double-precision value of \a __b.
751/// \param __b
752///     A 128-bit vector of [2 x double]. The lower double-precision value is
753///     compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
755///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
756static __inline__ __m128d __DEFAULT_FN_ATTRS
757_mm_cmpgt_sd(__m128d __a, __m128d __b)
758{
759  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
760  return (__m128d) { __c[0], __a[1] };
761}
762
763/// \brief Compares the lower double-precision floating-point values in each of
764///    the two 128-bit floating-point vectors of [2 x double] to determine if
765///    the value in the first parameter is greater than or equal to the
766///    corresponding value in the second parameter. The comparison yields 0h for
767///    false, FFFFFFFFFFFFFFFFh for true.
768///
769/// \headerfile <x86intrin.h>
770///
771/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
772///
773/// \param __a
774///    A 128-bit vector of [2 x double]. The lower double-precision value is
775///    compared to the lower double-precision value of \a __b.
776/// \param __b
777///    A 128-bit vector of [2 x double]. The lower double-precision value is
778///    compared to the lower double-precision value of \a __a.
779/// \returns A 128-bit vector. The lower 64 bits contains the comparison
780///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
781static __inline__ __m128d __DEFAULT_FN_ATTRS
782_mm_cmpge_sd(__m128d __a, __m128d __b)
783{
784  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
785  return (__m128d) { __c[0], __a[1] };
786}
787
788/// \brief Compares the lower double-precision floating-point values in each of
789///    the two 128-bit floating-point vectors of [2 x double] to determine if
790///    the value in the first parameter is "ordered" with respect to the
791///    corresponding value in the second parameter. The comparison yields 0h for
792///    false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are
793///    "ordered" with respect to each other if neither value is a NaN.
794///
795/// \headerfile <x86intrin.h>
796///
797/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
798///
799/// \param __a
800///    A 128-bit vector of [2 x double]. The lower double-precision value is
801///    compared to the lower double-precision value of \a __b.
802/// \param __b
803///    A 128-bit vector of [2 x double]. The lower double-precision value is
804///    compared to the lower double-precision value of \a __a.
805/// \returns A 128-bit vector. The lower 64 bits contains the comparison
806///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
807static __inline__ __m128d __DEFAULT_FN_ATTRS
808_mm_cmpord_sd(__m128d __a, __m128d __b)
809{
810  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
811}
812
813/// \brief Compares the lower double-precision floating-point values in each of
814///    the two 128-bit floating-point vectors of [2 x double] to determine if
815///    the value in the first parameter is "unordered" with respect to the
816///    corresponding value in the second parameter. The comparison yields 0h
817///    for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values
818///    are "unordered" with respect to each other if one or both values are NaN.
819///
820/// \headerfile <x86intrin.h>
821///
822/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
823///   instruction.
824///
825/// \param __a
826///    A 128-bit vector of [2 x double]. The lower double-precision value is
827///    compared to the lower double-precision value of \a __b.
828/// \param __b
829///    A 128-bit vector of [2 x double]. The lower double-precision value is
830///    compared to the lower double-precision value of \a __a.
831/// \returns A 128-bit vector. The lower 64 bits contains the comparison
832///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
833static __inline__ __m128d __DEFAULT_FN_ATTRS
834_mm_cmpunord_sd(__m128d __a, __m128d __b)
835{
836  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
837}
838
839/// \brief Compares the lower double-precision floating-point values in each of
840///    the two 128-bit floating-point vectors of [2 x double] to determine if
841///    the value in the first parameter is unequal to the corresponding value in
842///    the second parameter. The comparison yields 0h for false,
843///    FFFFFFFFFFFFFFFFh for true.
844///
845/// \headerfile <x86intrin.h>
846///
847/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
848///
849/// \param __a
850///    A 128-bit vector of [2 x double]. The lower double-precision value is
851///    compared to the lower double-precision value of \a __b.
852/// \param __b
853///    A 128-bit vector of [2 x double]. The lower double-precision value is
854///    compared to the lower double-precision value of \a __a.
855/// \returns A 128-bit vector. The lower 64 bits contains the comparison
856///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
857static __inline__ __m128d __DEFAULT_FN_ATTRS
858_mm_cmpneq_sd(__m128d __a, __m128d __b)
859{
860  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
861}
862
863/// \brief Compares the lower double-precision floating-point values in each of
864///    the two 128-bit floating-point vectors of [2 x double] to determine if
865///    the value in the first parameter is not less than the corresponding
866///    value in the second parameter. The comparison yields 0h for false,
867///    FFFFFFFFFFFFFFFFh for true.
868///
869/// \headerfile <x86intrin.h>
870///
871/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
872///
873/// \param __a
874///    A 128-bit vector of [2 x double]. The lower double-precision value is
875///    compared to the lower double-precision value of \a __b.
876/// \param __b
877///    A 128-bit vector of [2 x double]. The lower double-precision value is
878///    compared to the lower double-precision value of \a __a.
879/// \returns A 128-bit vector. The lower 64 bits contains the comparison
880///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
881static __inline__ __m128d __DEFAULT_FN_ATTRS
882_mm_cmpnlt_sd(__m128d __a, __m128d __b)
883{
884  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
885}
886
887/// \brief Compares the lower double-precision floating-point values in each of
888///    the two 128-bit floating-point vectors of [2 x double] to determine if
889///    the value in the first parameter is not less than or equal to the
890///    corresponding value in the second parameter. The comparison yields 0h
891///    for false, FFFFFFFFFFFFFFFFh for true.
892///
893/// \headerfile <x86intrin.h>
894///
895/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
896///
897/// \param __a
898///    A 128-bit vector of [2 x double]. The lower double-precision value is
899///    compared to the lower double-precision value of \a __b.
900/// \param __b
901///    A 128-bit vector of [2 x double]. The lower double-precision value is
902///    compared to the lower double-precision value of \a __a.
903/// \returns  A 128-bit vector. The lower 64 bits contains the comparison
904///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
905static __inline__ __m128d __DEFAULT_FN_ATTRS
906_mm_cmpnle_sd(__m128d __a, __m128d __b)
907{
908  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
909}
910
911/// \brief Compares the lower double-precision floating-point values in each of
912///    the two 128-bit floating-point vectors of [2 x double] to determine if
913///    the value in the first parameter is not greater than the corresponding
914///    value in the second parameter. The comparison yields 0h for false,
915///    FFFFFFFFFFFFFFFFh for true.
916///
917/// \headerfile <x86intrin.h>
918///
919/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
920///
921/// \param __a
922///    A 128-bit vector of [2 x double]. The lower double-precision value is
923///    compared to the lower double-precision value of \a __b.
924/// \param __b
925///    A 128-bit vector of [2 x double]. The lower double-precision value is
926///    compared to the lower double-precision value of \a __a.
927/// \returns A 128-bit vector. The lower 64 bits contains the comparison
928///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
929static __inline__ __m128d __DEFAULT_FN_ATTRS
930_mm_cmpngt_sd(__m128d __a, __m128d __b)
931{
932  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
933  return (__m128d) { __c[0], __a[1] };
934}
935
936/// \brief Compares the lower double-precision floating-point values in each of
937///    the two 128-bit floating-point vectors of [2 x double] to determine if
938///    the value in the first parameter is not greater than or equal to the
939///    corresponding value in the second parameter. The comparison yields 0h
940///    for false, FFFFFFFFFFFFFFFFh for true.
941///
942/// \headerfile <x86intrin.h>
943///
944/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
945///
946/// \param __a
947///    A 128-bit vector of [2 x double]. The lower double-precision value is
948///    compared to the lower double-precision value of \a __b.
949/// \param __b
950///    A 128-bit vector of [2 x double]. The lower double-precision value is
951///    compared to the lower double-precision value of \a __a.
952/// \returns A 128-bit vector. The lower 64 bits contains the comparison
953///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
954static __inline__ __m128d __DEFAULT_FN_ATTRS
955_mm_cmpnge_sd(__m128d __a, __m128d __b)
956{
957  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
958  return (__m128d) { __c[0], __a[1] };
959}
960
961/// \brief Compares the lower double-precision floating-point values in each of
962///    the two 128-bit floating-point vectors of [2 x double] for equality. The
963///    comparison yields 0 for false, 1 for true.
964///
965/// \headerfile <x86intrin.h>
966///
967/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
968///
969/// \param __a
970///    A 128-bit vector of [2 x double]. The lower double-precision value is
971///    compared to the lower double-precision value of \a __b.
972/// \param __b
973///    A 128-bit vector of [2 x double]. The lower double-precision value is
974///    compared to the lower double-precision value of \a __a.
975/// \returns An integer containing the comparison results.
976static __inline__ int __DEFAULT_FN_ATTRS
977_mm_comieq_sd(__m128d __a, __m128d __b)
978{
979  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
980}
981
982/// \brief Compares the lower double-precision floating-point values in each of
983///    the two 128-bit floating-point vectors of [2 x double] to determine if
984///    the value in the first parameter is less than the corresponding value in
985///    the second parameter. The comparison yields 0 for false, 1 for true.
986///
987/// \headerfile <x86intrin.h>
988///
989/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
990///
991/// \param __a
992///    A 128-bit vector of [2 x double]. The lower double-precision value is
993///    compared to the lower double-precision value of \a __b.
994/// \param __b
995///    A 128-bit vector of [2 x double]. The lower double-precision value is
996///    compared to the lower double-precision value of \a __a.
997/// \returns An integer containing the comparison results.
998static __inline__ int __DEFAULT_FN_ATTRS
999_mm_comilt_sd(__m128d __a, __m128d __b)
1000{
1001  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1002}
1003
1004/// \brief Compares the lower double-precision floating-point values in each of
1005///    the two 128-bit floating-point vectors of [2 x double] to determine if
1006///    the value in the first parameter is less than or equal to the
1007///    corresponding value in the second parameter. The comparison yields 0 for
1008///    false, 1 for true.
1009///
1010/// \headerfile <x86intrin.h>
1011///
1012/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1013///
1014/// \param __a
1015///    A 128-bit vector of [2 x double]. The lower double-precision value is
1016///    compared to the lower double-precision value of \a __b.
1017/// \param __b
1018///     A 128-bit vector of [2 x double]. The lower double-precision value is
1019///     compared to the lower double-precision value of \a __a.
1020/// \returns An integer containing the comparison results.
1021static __inline__ int __DEFAULT_FN_ATTRS
1022_mm_comile_sd(__m128d __a, __m128d __b)
1023{
1024  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1025}
1026
1027/// \brief Compares the lower double-precision floating-point values in each of
1028///    the two 128-bit floating-point vectors of [2 x double] to determine if
1029///    the value in the first parameter is greater than the corresponding value
1030///    in the second parameter. The comparison yields 0 for false, 1 for true.
1031///
1032/// \headerfile <x86intrin.h>
1033///
1034/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1035///
1036/// \param __a
1037///    A 128-bit vector of [2 x double]. The lower double-precision value is
1038///    compared to the lower double-precision value of \a __b.
1039/// \param __b
1040///    A 128-bit vector of [2 x double]. The lower double-precision value is
1041///    compared to the lower double-precision value of \a __a.
1042/// \returns An integer containing the comparison results.
1043static __inline__ int __DEFAULT_FN_ATTRS
1044_mm_comigt_sd(__m128d __a, __m128d __b)
1045{
1046  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1047}
1048
1049/// \brief Compares the lower double-precision floating-point values in each of
1050///    the two 128-bit floating-point vectors of [2 x double] to determine if
1051///    the value in the first parameter is greater than or equal to the
1052///    corresponding value in the second parameter. The comparison yields 0 for
1053///    false, 1 for true.
1054///
1055/// \headerfile <x86intrin.h>
1056///
1057/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1058///
1059/// \param __a
1060///    A 128-bit vector of [2 x double]. The lower double-precision value is
1061///    compared to the lower double-precision value of \a __b.
1062/// \param __b
1063///    A 128-bit vector of [2 x double]. The lower double-precision value is
1064///    compared to the lower double-precision value of \a __a.
1065/// \returns An integer containing the comparison results.
1066static __inline__ int __DEFAULT_FN_ATTRS
1067_mm_comige_sd(__m128d __a, __m128d __b)
1068{
1069  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1070}
1071
1072/// \brief Compares the lower double-precision floating-point values in each of
1073///    the two 128-bit floating-point vectors of [2 x double] to determine if
1074///    the value in the first parameter is unequal to the corresponding value in
1075///    the second parameter. The comparison yields 0 for false, 1 for true.
1076///
1077/// \headerfile <x86intrin.h>
1078///
1079/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1080///
1081/// \param __a
1082///    A 128-bit vector of [2 x double]. The lower double-precision value is
1083///    compared to the lower double-precision value of \a __b.
1084/// \param __b
1085///    A 128-bit vector of [2 x double]. The lower double-precision value is
1086///    compared to the lower double-precision value of \a __a.
1087/// \returns An integer containing the comparison results.
1088static __inline__ int __DEFAULT_FN_ATTRS
1089_mm_comineq_sd(__m128d __a, __m128d __b)
1090{
1091  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1092}
1093
1094/// \brief Compares the lower double-precision floating-point values in each of
1095///    the two 128-bit floating-point vectors of [2 x double] for equality. The
1096///    comparison yields 0 for false, 1 for true. If either of the two lower
1097///    double-precision values is NaN, 1 is returned.
1098///
1099/// \headerfile <x86intrin.h>
1100///
1101/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1102///
1103/// \param __a
1104///    A 128-bit vector of [2 x double]. The lower double-precision value is
1105///    compared to the lower double-precision value of \a __b.
1106/// \param __b
1107///    A 128-bit vector of [2 x double]. The lower double-precision value is
1108///    compared to the lower double-precision value of \a __a.
1109/// \returns An integer containing the comparison results. If either of the two
1110///    lower double-precision values is NaN, 1 is returned.
1111static __inline__ int __DEFAULT_FN_ATTRS
1112_mm_ucomieq_sd(__m128d __a, __m128d __b)
1113{
1114  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1115}
1116
1117/// \brief Compares the lower double-precision floating-point values in each of
1118///    the two 128-bit floating-point vectors of [2 x double] to determine if
1119///    the value in the first parameter is less than the corresponding value in
1120///    the second parameter. The comparison yields 0 for false, 1 for true. If
1121///    either of the two lower double-precision values is NaN, 1 is returned.
1122///
1123/// \headerfile <x86intrin.h>
1124///
1125/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1126///
1127/// \param __a
1128///    A 128-bit vector of [2 x double]. The lower double-precision value is
1129///    compared to the lower double-precision value of \a __b.
1130/// \param __b
1131///    A 128-bit vector of [2 x double]. The lower double-precision value is
1132///    compared to the lower double-precision value of \a __a.
1133/// \returns An integer containing the comparison results. If either of the two
1134///    lower double-precision values is NaN, 1 is returned.
1135static __inline__ int __DEFAULT_FN_ATTRS
1136_mm_ucomilt_sd(__m128d __a, __m128d __b)
1137{
1138  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1139}
1140
1141/// \brief Compares the lower double-precision floating-point values in each of
1142///    the two 128-bit floating-point vectors of [2 x double] to determine if
1143///    the value in the first parameter is less than or equal to the
1144///    corresponding value in the second parameter. The comparison yields 0 for
1145///    false, 1 for true. If either of the two lower double-precision values is
1146///    NaN, 1 is returned.
1147///
1148/// \headerfile <x86intrin.h>
1149///
1150/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1151///
1152/// \param __a
1153///    A 128-bit vector of [2 x double]. The lower double-precision value is
1154///    compared to the lower double-precision value of \a __b.
1155/// \param __b
1156///     A 128-bit vector of [2 x double]. The lower double-precision value is
1157///     compared to the lower double-precision value of \a __a.
1158/// \returns An integer containing the comparison results. If either of the two
1159///     lower double-precision values is NaN, 1 is returned.
1160static __inline__ int __DEFAULT_FN_ATTRS
1161_mm_ucomile_sd(__m128d __a, __m128d __b)
1162{
1163  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1164}
1165
1166/// \brief Compares the lower double-precision floating-point values in each of
1167///    the two 128-bit floating-point vectors of [2 x double] to determine if
1168///    the value in the first parameter is greater than the corresponding value
1169///    in the second parameter. The comparison yields 0 for false, 1 for true.
1170///    If either of the two lower double-precision values is NaN, 0 is returned.
1171///
1172/// \headerfile <x86intrin.h>
1173///
1174/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1175///
1176/// \param __a
1177///    A 128-bit vector of [2 x double]. The lower double-precision value is
1178///    compared to the lower double-precision value of \a __b.
1179/// \param __b
1180///     A 128-bit vector of [2 x double]. The lower double-precision value is
1181///     compared to the lower double-precision value of \a __a.
1182/// \returns An integer containing the comparison results. If either of the two
1183///     lower double-precision values is NaN, 0 is returned.
1184static __inline__ int __DEFAULT_FN_ATTRS
1185_mm_ucomigt_sd(__m128d __a, __m128d __b)
1186{
1187  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1188}
1189
1190/// \brief Compares the lower double-precision floating-point values in each of
1191///    the two 128-bit floating-point vectors of [2 x double] to determine if
1192///    the value in the first parameter is greater than or equal to the
1193///    corresponding value in the second parameter. The comparison yields 0 for
1194///    false, 1 for true.  If either of the two lower double-precision values
1195///    is NaN, 0 is returned.
1196///
1197/// \headerfile <x86intrin.h>
1198///
1199/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1200///
1201/// \param __a
1202///    A 128-bit vector of [2 x double]. The lower double-precision value is
1203///    compared to the lower double-precision value of \a __b.
1204/// \param __b
1205///    A 128-bit vector of [2 x double]. The lower double-precision value is
1206///    compared to the lower double-precision value of \a __a.
1207/// \returns An integer containing the comparison results. If either of the two
1208///    lower double-precision values is NaN, 0 is returned.
1209static __inline__ int __DEFAULT_FN_ATTRS
1210_mm_ucomige_sd(__m128d __a, __m128d __b)
1211{
1212  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1213}
1214
1215/// \brief Compares the lower double-precision floating-point values in each of
1216///    the two 128-bit floating-point vectors of [2 x double] to determine if
1217///    the value in the first parameter is unequal to the corresponding value in
1218///    the second parameter. The comparison yields 0 for false, 1 for true. If
1219///    either of the two lower double-precision values is NaN, 0 is returned.
1220///
1221/// \headerfile <x86intrin.h>
1222///
1223/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1224///
1225/// \param __a
1226///    A 128-bit vector of [2 x double]. The lower double-precision value is
1227///    compared to the lower double-precision value of \a __b.
1228/// \param __b
1229///    A 128-bit vector of [2 x double]. The lower double-precision value is
1230///    compared to the lower double-precision value of \a __a.
1231/// \returns An integer containing the comparison result. If either of the two
1232///    lower double-precision values is NaN, 0 is returned.
1233static __inline__ int __DEFAULT_FN_ATTRS
1234_mm_ucomineq_sd(__m128d __a, __m128d __b)
1235{
1236  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1237}
1238
1239/// \brief Converts the two double-precision floating-point elements of a
1240///    128-bit vector of [2 x double] into two single-precision floating-point
1241///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1242///    The upper 64 bits of the result vector are set to zero.
1243///
1244/// \headerfile <x86intrin.h>
1245///
1246/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1247///
1248/// \param __a
1249///    A 128-bit vector of [2 x double].
1250/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1251///    converted values. The upper 64 bits are set to zero.
1252static __inline__ __m128 __DEFAULT_FN_ATTRS
1253_mm_cvtpd_ps(__m128d __a)
1254{
1255  return __builtin_ia32_cvtpd2ps((__v2df)__a);
1256}
1257
1258/// \brief Converts the lower two single-precision floating-point elements of a
1259///    128-bit vector of [4 x float] into two double-precision floating-point
1260///    values, returned in a 128-bit vector of [2 x double]. The upper two
1261///    elements of the input vector are unused.
1262///
1263/// \headerfile <x86intrin.h>
1264///
1265/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1266///
1267/// \param __a
1268///    A 128-bit vector of [4 x float]. The lower two single-precision
1269///    floating-point elements are converted to double-precision values. The
1270///    upper two elements are unused.
1271/// \returns A 128-bit vector of [2 x double] containing the converted values.
1272static __inline__ __m128d __DEFAULT_FN_ATTRS
1273_mm_cvtps_pd(__m128 __a)
1274{
1275  return (__m128d) __builtin_convertvector(
1276      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1277}
1278
1279/// \brief Converts the lower two integer elements of a 128-bit vector of
1280///    [4 x i32] into two double-precision floating-point values, returned in a
1281///    128-bit vector of [2 x double]. The upper two elements of the input
1282///    vector are unused.
1283///
1284/// \headerfile <x86intrin.h>
1285///
1286/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1287///
1288/// \param __a
1289///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1290///    converted to double-precision values. The upper two elements are unused.
1291/// \returns A 128-bit vector of [2 x double] containing the converted values.
1292static __inline__ __m128d __DEFAULT_FN_ATTRS
1293_mm_cvtepi32_pd(__m128i __a)
1294{
1295  return (__m128d) __builtin_convertvector(
1296      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1297}
1298
1299/// \brief Converts the two double-precision floating-point elements of a
1300///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1301///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1302///    64 bits of the result vector are set to zero.
1303///
1304/// \headerfile <x86intrin.h>
1305///
1306/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1307///
1308/// \param __a
1309///    A 128-bit vector of [2 x double].
1310/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1311///    converted values. The upper 64 bits are set to zero.
1312static __inline__ __m128i __DEFAULT_FN_ATTRS
1313_mm_cvtpd_epi32(__m128d __a)
1314{
1315  return __builtin_ia32_cvtpd2dq((__v2df)__a);
1316}
1317
1318/// \brief Converts the low-order element of a 128-bit vector of [2 x double]
1319///    into a 32-bit signed integer value.
1320///
1321/// \headerfile <x86intrin.h>
1322///
1323/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1324///
1325/// \param __a
1326///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1327///    conversion.
1328/// \returns A 32-bit signed integer containing the converted value.
1329static __inline__ int __DEFAULT_FN_ATTRS
1330_mm_cvtsd_si32(__m128d __a)
1331{
1332  return __builtin_ia32_cvtsd2si((__v2df)__a);
1333}
1334
1335/// \brief Converts the lower double-precision floating-point element of a
1336///    128-bit vector of [2 x double], in the second parameter, into a
1337///    single-precision floating-point value, returned in the lower 32 bits of a
1338///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1339///    copied from the upper 96 bits of the first parameter.
1340///
1341/// \headerfile <x86intrin.h>
1342///
1343/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1344///
1345/// \param __a
1346///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1347///    copied to the upper 96 bits of the result.
1348/// \param __b
1349///    A 128-bit vector of [2 x double]. The lower double-precision
1350///    floating-point element is used in the conversion.
1351/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1352///    converted value from the second parameter. The upper 96 bits are copied
1353///    from the upper 96 bits of the first parameter.
1354static __inline__ __m128 __DEFAULT_FN_ATTRS
1355_mm_cvtsd_ss(__m128 __a, __m128d __b)
1356{
1357  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1358}
1359
1360/// \brief Converts a 32-bit signed integer value, in the second parameter, into
1361///    a double-precision floating-point value, returned in the lower 64 bits of
1362///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1363///    are copied from the upper 64 bits of the first parameter.
1364///
1365/// \headerfile <x86intrin.h>
1366///
1367/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1368///
1369/// \param __a
1370///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1371///    copied to the upper 64 bits of the result.
1372/// \param __b
1373///    A 32-bit signed integer containing the value to be converted.
1374/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1375///    converted value from the second parameter. The upper 64 bits are copied
1376///    from the upper 64 bits of the first parameter.
1377static __inline__ __m128d __DEFAULT_FN_ATTRS
1378_mm_cvtsi32_sd(__m128d __a, int __b)
1379{
1380  __a[0] = __b;
1381  return __a;
1382}
1383
1384/// \brief Converts the lower single-precision floating-point element of a
1385///    128-bit vector of [4 x float], in the second parameter, into a
1386///    double-precision floating-point value, returned in the lower 64 bits of
1387///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1388///    are copied from the upper 64 bits of the first parameter.
1389///
1390/// \headerfile <x86intrin.h>
1391///
1392/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1393///
1394/// \param __a
1395///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1396///    copied to the upper 64 bits of the result.
1397/// \param __b
1398///    A 128-bit vector of [4 x float]. The lower single-precision
1399///    floating-point element is used in the conversion.
1400/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1401///    converted value from the second parameter. The upper 64 bits are copied
1402///    from the upper 64 bits of the first parameter.
1403static __inline__ __m128d __DEFAULT_FN_ATTRS
1404_mm_cvtss_sd(__m128d __a, __m128 __b)
1405{
1406  __a[0] = __b[0];
1407  return __a;
1408}
1409
1410/// \brief Converts the two double-precision floating-point elements of a
1411///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1412///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the
1413///    result of either conversion is inexact, the result is truncated (rounded
1414///    towards zero) regardless of the current MXCSR setting. The upper 64 bits
1415///    of the result vector are set to zero.
1416///
1417/// \headerfile <x86intrin.h>
1418///
1419/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1420///   instruction.
1421///
1422/// \param __a
1423///    A 128-bit vector of [2 x double].
1424/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1425///    converted values. The upper 64 bits are set to zero.
1426static __inline__ __m128i __DEFAULT_FN_ATTRS
1427_mm_cvttpd_epi32(__m128d __a)
1428{
1429  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1430}
1431
1432/// \brief Converts the low-order element of a [2 x double] vector into a 32-bit
1433///    signed integer value, truncating the result when it is inexact.
1434///
1435/// \headerfile <x86intrin.h>
1436///
1437/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1438///   instruction.
1439///
1440/// \param __a
1441///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1442///    conversion.
1443/// \returns A 32-bit signed integer containing the converted value.
1444static __inline__ int __DEFAULT_FN_ATTRS
1445_mm_cvttsd_si32(__m128d __a)
1446{
1447  return __builtin_ia32_cvttsd2si((__v2df)__a);
1448}
1449
1450/// \brief Converts the two double-precision floating-point elements of a
1451///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1452///    returned in a 64-bit vector of [2 x i32].
1453///
1454/// \headerfile <x86intrin.h>
1455///
1456/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1457///
1458/// \param __a
1459///    A 128-bit vector of [2 x double].
1460/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1461static __inline__ __m64 __DEFAULT_FN_ATTRS
1462_mm_cvtpd_pi32(__m128d __a)
1463{
1464  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1465}
1466
1467/// \brief Converts the two double-precision floating-point elements of a
1468///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1469///    returned in a 64-bit vector of [2 x i32]. If the result of either
1470///    conversion is inexact, the result is truncated (rounded towards zero)
1471///    regardless of the current MXCSR setting.
1472///
1473/// \headerfile <x86intrin.h>
1474///
1475/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1476///
1477/// \param __a
1478///    A 128-bit vector of [2 x double].
1479/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1480static __inline__ __m64 __DEFAULT_FN_ATTRS
1481_mm_cvttpd_pi32(__m128d __a)
1482{
1483  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1484}
1485
1486/// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of
1487///    [2 x i32] into two double-precision floating-point values, returned in a
1488///    128-bit vector of [2 x double].
1489///
1490/// \headerfile <x86intrin.h>
1491///
1492/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1493///
1494/// \param __a
1495///    A 64-bit vector of [2 x i32].
1496/// \returns A 128-bit vector of [2 x double] containing the converted values.
1497static __inline__ __m128d __DEFAULT_FN_ATTRS
1498_mm_cvtpi32_pd(__m64 __a)
1499{
1500  return __builtin_ia32_cvtpi2pd((__v2si)__a);
1501}
1502
1503/// \brief Returns the low-order element of a 128-bit vector of [2 x double] as
1504///    a double-precision floating-point value.
1505///
1506/// \headerfile <x86intrin.h>
1507///
1508/// This intrinsic has no corresponding instruction.
1509///
1510/// \param __a
1511///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1512/// \returns A double-precision floating-point value copied from the lower 64
1513///    bits of \a __a.
1514static __inline__ double __DEFAULT_FN_ATTRS
1515_mm_cvtsd_f64(__m128d __a)
1516{
1517  return __a[0];
1518}
1519
1520/// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned
1521///    memory location.
1522///
1523/// \headerfile <x86intrin.h>
1524///
1525/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1526///
1527/// \param __dp
1528///    A pointer to a 128-bit memory location. The address of the memory
1529///    location has to be 16-byte aligned.
1530/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1531static __inline__ __m128d __DEFAULT_FN_ATTRS
1532_mm_load_pd(double const *__dp)
1533{
1534  return *(__m128d*)__dp;
1535}
1536
1537/// \brief Loads a double-precision floating-point value from a specified memory
1538///    location and duplicates it to both vector elements of a 128-bit vector of
1539///    [2 x double].
1540///
1541/// \headerfile <x86intrin.h>
1542///
1543/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1544///
1545/// \param __dp
1546///    A pointer to a memory location containing a double-precision value.
1547/// \returns A 128-bit vector of [2 x double] containing the loaded and
1548///    duplicated values.
1549static __inline__ __m128d __DEFAULT_FN_ATTRS
1550_mm_load1_pd(double const *__dp)
1551{
1552  struct __mm_load1_pd_struct {
1553    double __u;
1554  } __attribute__((__packed__, __may_alias__));
1555  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
1556  return (__m128d){ __u, __u };
1557}
1558
/* Alternate spelling of _mm_load1_pd; expands to the very same call. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
1560
1561/// \brief Loads two double-precision values, in reverse order, from an aligned
1562///    memory location into a 128-bit vector of [2 x double].
1563///
1564/// \headerfile <x86intrin.h>
1565///
1566/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1567/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1568/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1569///
1570/// \param __dp
1571///    A 16-byte aligned pointer to an array of double-precision values to be
1572///    loaded in reverse order.
1573/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1574///    values.
1575static __inline__ __m128d __DEFAULT_FN_ATTRS
1576_mm_loadr_pd(double const *__dp)
1577{
1578  __m128d __u = *(__m128d*)__dp;
1579  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1580}
1581
1582/// \brief Loads a 128-bit floating-point vector of [2 x double] from an
1583///    unaligned memory location.
1584///
1585/// \headerfile <x86intrin.h>
1586///
1587/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1588///
1589/// \param __dp
1590///    A pointer to a 128-bit memory location. The address of the memory
1591///    location does not have to be aligned.
1592/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1593static __inline__ __m128d __DEFAULT_FN_ATTRS
1594_mm_loadu_pd(double const *__dp)
1595{
1596  struct __loadu_pd {
1597    __m128d __v;
1598  } __attribute__((__packed__, __may_alias__));
1599  return ((struct __loadu_pd*)__dp)->__v;
1600}
1601
1602/// \brief Loads a 64-bit integer value to the low element of a 128-bit integer
1603///    vector and clears the upper element.
1604///
1605/// \headerfile <x86intrin.h>
1606///
1607/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1608///
1609/// \param __dp
1610///    A pointer to a 64-bit memory location. The address of the memory
1611///    location does not have to be aligned.
1612/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1613static __inline__ __m128i __DEFAULT_FN_ATTRS
1614_mm_loadu_si64(void const *__a)
1615{
1616  struct __loadu_si64 {
1617    long long __v;
1618  } __attribute__((__packed__, __may_alias__));
1619  long long __u = ((struct __loadu_si64*)__a)->__v;
1620  return (__m128i){__u, 0L};
1621}
1622
1623/// \brief Loads a 64-bit double-precision value to the low element of a
1624///    128-bit integer vector and clears the upper element.
1625///
1626/// \headerfile <x86intrin.h>
1627///
1628/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1629///
1630/// \param __dp
1631///    An pointer to a memory location containing a double-precision value.
1632///    The address of the memory location does not have to be aligned.
1633/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1634static __inline__ __m128d __DEFAULT_FN_ATTRS
1635_mm_load_sd(double const *__dp)
1636{
1637  struct __mm_load_sd_struct {
1638    double __u;
1639  } __attribute__((__packed__, __may_alias__));
1640  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
1641  return (__m128d){ __u, 0 };
1642}
1643
1644/// \brief Loads a double-precision value into the high-order bits of a 128-bit
1645///    vector of [2 x double]. The low-order bits are copied from the low-order
1646///    bits of the first operand.
1647///
1648/// \headerfile <x86intrin.h>
1649///
1650/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1651///
1652/// \param __a
1653///    A 128-bit vector of [2 x double]. \n
1654///    Bits [63:0] are written to bits [63:0] of the result.
1655/// \param __dp
1656///    A pointer to a 64-bit memory location containing a double-precision
1657///    floating-point value that is loaded. The loaded value is written to bits
1658///    [127:64] of the result. The address of the memory location does not have
1659///    to be aligned.
1660/// \returns A 128-bit vector of [2 x double] containing the moved values.
1661static __inline__ __m128d __DEFAULT_FN_ATTRS
1662_mm_loadh_pd(__m128d __a, double const *__dp)
1663{
1664  struct __mm_loadh_pd_struct {
1665    double __u;
1666  } __attribute__((__packed__, __may_alias__));
1667  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
1668  return (__m128d){ __a[0], __u };
1669}
1670
1671/// \brief Loads a double-precision value into the low-order bits of a 128-bit
1672///    vector of [2 x double]. The high-order bits are copied from the
1673///    high-order bits of the first operand.
1674///
1675/// \headerfile <x86intrin.h>
1676///
1677/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1678///
1679/// \param __a
1680///    A 128-bit vector of [2 x double]. \n
1681///    Bits [127:64] are written to bits [127:64] of the result.
1682/// \param __dp
1683///    A pointer to a 64-bit memory location containing a double-precision
1684///    floating-point value that is loaded. The loaded value is written to bits
1685///    [63:0] of the result. The address of the memory location does not have to
1686///    be aligned.
1687/// \returns A 128-bit vector of [2 x double] containing the moved values.
1688static __inline__ __m128d __DEFAULT_FN_ATTRS
1689_mm_loadl_pd(__m128d __a, double const *__dp)
1690{
1691  struct __mm_loadl_pd_struct {
1692    double __u;
1693  } __attribute__((__packed__, __may_alias__));
1694  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
1695  return (__m128d){ __u, __a[1] };
1696}
1697
1698/// \brief Constructs a 128-bit floating-point vector of [2 x double] with
1699///    unspecified content. This could be used as an argument to another
1700///    intrinsic function where the argument is required but the value is not
1701///    actually used.
1702///
1703/// \headerfile <x86intrin.h>
1704///
1705/// This intrinsic has no corresponding instruction.
1706///
1707/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1708///    content.
1709static __inline__ __m128d __DEFAULT_FN_ATTRS
1710_mm_undefined_pd(void)
1711{
1712  return (__m128d)__builtin_ia32_undef128();
1713}
1714
1715/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
1716///    64 bits of the vector are initialized with the specified double-precision
1717///    floating-point value. The upper 64 bits are set to zero.
1718///
1719/// \headerfile <x86intrin.h>
1720///
1721/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1722///
1723/// \param __w
1724///    A double-precision floating-point value used to initialize the lower 64
1725///    bits of the result.
1726/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1727///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1728///    set to zero.
1729static __inline__ __m128d __DEFAULT_FN_ATTRS
1730_mm_set_sd(double __w)
1731{
1732  return (__m128d){ __w, 0 };
1733}
1734
1735/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
1736///    of the two double-precision floating-point vector elements set to the
1737///    specified double-precision floating-point value.
1738///
1739/// \headerfile <x86intrin.h>
1740///
1741/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1742///
1743/// \param __w
1744///    A double-precision floating-point value used to initialize each vector
1745///    element of the result.
1746/// \returns An initialized 128-bit floating-point vector of [2 x double].
1747static __inline__ __m128d __DEFAULT_FN_ATTRS
1748_mm_set1_pd(double __w)
1749{
1750  return (__m128d){ __w, __w };
1751}
1752
1753/// \brief Constructs a 128-bit floating-point vector of [2 x double]
1754///    initialized with the specified double-precision floating-point values.
1755///
1756/// \headerfile <x86intrin.h>
1757///
1758/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1759///
1760/// \param __w
1761///    A double-precision floating-point value used to initialize the upper 64
1762///    bits of the result.
1763/// \param __x
1764///    A double-precision floating-point value used to initialize the lower 64
1765///    bits of the result.
1766/// \returns An initialized 128-bit floating-point vector of [2 x double].
1767static __inline__ __m128d __DEFAULT_FN_ATTRS
1768_mm_set_pd(double __w, double __x)
1769{
1770  return (__m128d){ __x, __w };
1771}
1772
1773/// \brief Constructs a 128-bit floating-point vector of [2 x double],
1774///    initialized in reverse order with the specified double-precision
1775///    floating-point values.
1776///
1777/// \headerfile <x86intrin.h>
1778///
1779/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1780///
1781/// \param __w
1782///    A double-precision floating-point value used to initialize the lower 64
1783///    bits of the result.
1784/// \param __x
1785///    A double-precision floating-point value used to initialize the upper 64
1786///    bits of the result.
1787/// \returns An initialized 128-bit floating-point vector of [2 x double].
1788static __inline__ __m128d __DEFAULT_FN_ATTRS
1789_mm_setr_pd(double __w, double __x)
1790{
1791  return (__m128d){ __w, __x };
1792}
1793
1794/// \brief Constructs a 128-bit floating-point vector of [2 x double]
1795///    initialized to zero.
1796///
1797/// \headerfile <x86intrin.h>
1798///
1799/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1800///
1801/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1802///    all elements set to zero.
1803static __inline__ __m128d __DEFAULT_FN_ATTRS
1804_mm_setzero_pd(void)
1805{
1806  return (__m128d){ 0, 0 };
1807}
1808
1809/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
1810///    64 bits are set to the lower 64 bits of the second parameter. The upper
1811///    64 bits are set to the upper 64 bits of the first parameter.
1812///
1813/// \headerfile <x86intrin.h>
1814///
1815/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1816///
1817/// \param __a
1818///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1819///    upper 64 bits of the result.
1820/// \param __b
1821///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1822///    lower 64 bits of the result.
1823/// \returns A 128-bit vector of [2 x double] containing the moved values.
1824static __inline__ __m128d __DEFAULT_FN_ATTRS
1825_mm_move_sd(__m128d __a, __m128d __b)
1826{
1827  return (__m128d){ __b[0], __a[1] };
1828}
1829
1830/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1831///    memory location.
1832///
1833/// \headerfile <x86intrin.h>
1834///
1835/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1836///
1837/// \param __dp
1838///    A pointer to a 64-bit memory location.
1839/// \param __a
1840///    A 128-bit vector of [2 x double] containing the value to be stored.
1841static __inline__ void __DEFAULT_FN_ATTRS
1842_mm_store_sd(double *__dp, __m128d __a)
1843{
1844  struct __mm_store_sd_struct {
1845    double __u;
1846  } __attribute__((__packed__, __may_alias__));
1847  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
1848}
1849
1850static __inline__ void __DEFAULT_FN_ATTRS
1851_mm_store_pd(double *__dp, __m128d __a)
1852{
1853  *(__m128d*)__dp = __a;
1854}
1855
1856static __inline__ void __DEFAULT_FN_ATTRS
1857_mm_store1_pd(double *__dp, __m128d __a)
1858{
1859  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1860  _mm_store_pd(__dp, __a);
1861}
1862
1863/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
1864///    location.
1865///
1866/// \headerfile <x86intrin.h>
1867///
1868/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1869///
1870/// \param __dp
1871///    A pointer to a 128-bit memory location. The address of the memory
1872///    location has to be 16-byte aligned.
1873/// \param __a
1874///    A 128-bit vector of [2 x double] containing the values to be stored.
1875static __inline__ void __DEFAULT_FN_ATTRS
1876_mm_store_pd1(double *__dp, __m128d __a)
1877{
1878  return _mm_store1_pd(__dp, __a);
1879}
1880
1881/// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory
1882///    location.
1883///
1884/// \headerfile <x86intrin.h>
1885///
1886/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1887///
1888/// \param __dp
1889///    A pointer to a 128-bit memory location. The address of the memory
1890///    location does not have to be aligned.
1891/// \param __a
1892///    A 128-bit vector of [2 x double] containing the values to be stored.
1893static __inline__ void __DEFAULT_FN_ATTRS
1894_mm_storeu_pd(double *__dp, __m128d __a)
1895{
1896  struct __storeu_pd {
1897    __m128d __v;
1898  } __attribute__((__packed__, __may_alias__));
1899  ((struct __storeu_pd*)__dp)->__v = __a;
1900}
1901
1902/// \brief Stores two double-precision values, in reverse order, from a 128-bit
1903///    vector of [2 x double] to a 16-byte aligned memory location.
1904///
1905/// \headerfile <x86intrin.h>
1906///
1907/// This intrinsic corresponds to a shuffling instruction followed by a
1908/// <c> VMOVAPD / MOVAPD </c> instruction.
1909///
1910/// \param __dp
1911///    A pointer to a 16-byte aligned memory location that can store two
1912///    double-precision values.
1913/// \param __a
1914///    A 128-bit vector of [2 x double] containing the values to be reversed and
1915///    stored.
1916static __inline__ void __DEFAULT_FN_ATTRS
1917_mm_storer_pd(double *__dp, __m128d __a)
1918{
1919  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1920  *(__m128d *)__dp = __a;
1921}
1922
1923/// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1924///    memory location.
1925///
1926/// \headerfile <x86intrin.h>
1927///
1928/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1929///
1930/// \param __dp
1931///    A pointer to a 64-bit memory location.
1932/// \param __a
1933///    A 128-bit vector of [2 x double] containing the value to be stored.
1934static __inline__ void __DEFAULT_FN_ATTRS
1935_mm_storeh_pd(double *__dp, __m128d __a)
1936{
1937  struct __mm_storeh_pd_struct {
1938    double __u;
1939  } __attribute__((__packed__, __may_alias__));
1940  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
1941}
1942
1943/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1944///    memory location.
1945///
1946/// \headerfile <x86intrin.h>
1947///
1948/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1949///
1950/// \param __dp
1951///    A pointer to a 64-bit memory location.
1952/// \param __a
1953///    A 128-bit vector of [2 x double] containing the value to be stored.
1954static __inline__ void __DEFAULT_FN_ATTRS
1955_mm_storel_pd(double *__dp, __m128d __a)
1956{
1957  struct __mm_storeh_pd_struct {
1958    double __u;
1959  } __attribute__((__packed__, __may_alias__));
1960  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
1961}
1962
1963/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
1964///    saving the lower 8 bits of each sum in the corresponding element of a
1965///    128-bit result vector of [16 x i8]. The integer elements of both
1966///    parameters can be either signed or unsigned.
1967///
1968/// \headerfile <x86intrin.h>
1969///
1970/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
1971///
1972/// \param __a
1973///    A 128-bit vector of [16 x i8].
1974/// \param __b
1975///    A 128-bit vector of [16 x i8].
1976/// \returns A 128-bit vector of [16 x i8] containing the sums of both
1977///    parameters.
1978static __inline__ __m128i __DEFAULT_FN_ATTRS
1979_mm_add_epi8(__m128i __a, __m128i __b)
1980{
1981  return (__m128i)((__v16qu)__a + (__v16qu)__b);
1982}
1983
1984/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
1985///    saving the lower 16 bits of each sum in the corresponding element of a
1986///    128-bit result vector of [8 x i16]. The integer elements of both
1987///    parameters can be either signed or unsigned.
1988///
1989/// \headerfile <x86intrin.h>
1990///
1991/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
1992///
1993/// \param __a
1994///    A 128-bit vector of [8 x i16].
1995/// \param __b
1996///    A 128-bit vector of [8 x i16].
1997/// \returns A 128-bit vector of [8 x i16] containing the sums of both
1998///    parameters.
1999static __inline__ __m128i __DEFAULT_FN_ATTRS
2000_mm_add_epi16(__m128i __a, __m128i __b)
2001{
2002  return (__m128i)((__v8hu)__a + (__v8hu)__b);
2003}
2004
2005/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2006///    saving the lower 32 bits of each sum in the corresponding element of a
2007///    128-bit result vector of [4 x i32]. The integer elements of both
2008///    parameters can be either signed or unsigned.
2009///
2010/// \headerfile <x86intrin.h>
2011///
2012/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2013///
2014/// \param __a
2015///    A 128-bit vector of [4 x i32].
2016/// \param __b
2017///    A 128-bit vector of [4 x i32].
2018/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2019///    parameters.
2020static __inline__ __m128i __DEFAULT_FN_ATTRS
2021_mm_add_epi32(__m128i __a, __m128i __b)
2022{
2023  return (__m128i)((__v4su)__a + (__v4su)__b);
2024}
2025
2026/// \brief Adds two signed or unsigned 64-bit integer values, returning the
2027///    lower 64 bits of the sum.
2028///
2029/// \headerfile <x86intrin.h>
2030///
2031/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2032///
2033/// \param __a
2034///    A 64-bit integer.
2035/// \param __b
2036///    A 64-bit integer.
2037/// \returns A 64-bit integer containing the sum of both parameters.
2038static __inline__ __m64 __DEFAULT_FN_ATTRS
2039_mm_add_si64(__m64 __a, __m64 __b)
2040{
2041  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2042}
2043
2044/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2045///    saving the lower 64 bits of each sum in the corresponding element of a
2046///    128-bit result vector of [2 x i64]. The integer elements of both
2047///    parameters can be either signed or unsigned.
2048///
2049/// \headerfile <x86intrin.h>
2050///
2051/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2052///
2053/// \param __a
2054///    A 128-bit vector of [2 x i64].
2055/// \param __b
2056///    A 128-bit vector of [2 x i64].
2057/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2058///    parameters.
2059static __inline__ __m128i __DEFAULT_FN_ATTRS
2060_mm_add_epi64(__m128i __a, __m128i __b)
2061{
2062  return (__m128i)((__v2du)__a + (__v2du)__b);
2063}
2064
2065/// \brief Adds, with saturation, the corresponding elements of two 128-bit
2066///    signed [16 x i8] vectors, saving each sum in the corresponding element of
2067///    a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are
2068///    saturated to 7Fh. Negative sums less than 80h are saturated to 80h.
2069///
2070/// \headerfile <x86intrin.h>
2071///
2072/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2073///
2074/// \param __a
2075///    A 128-bit signed [16 x i8] vector.
2076/// \param __b
2077///    A 128-bit signed [16 x i8] vector.
2078/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2079///    both parameters.
2080static __inline__ __m128i __DEFAULT_FN_ATTRS
2081_mm_adds_epi8(__m128i __a, __m128i __b)
2082{
2083  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
2084}
2085
2086/// \brief Adds, with saturation, the corresponding elements of two 128-bit
2087///    signed [8 x i16] vectors, saving each sum in the corresponding element of
2088///    a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh
2089///    are saturated to 7FFFh. Negative sums less than 8000h are saturated to
2090///    8000h.
2091///
2092/// \headerfile <x86intrin.h>
2093///
2094/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2095///
2096/// \param __a
2097///    A 128-bit signed [8 x i16] vector.
2098/// \param __b
2099///    A 128-bit signed [8 x i16] vector.
2100/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2101///    both parameters.
2102static __inline__ __m128i __DEFAULT_FN_ATTRS
2103_mm_adds_epi16(__m128i __a, __m128i __b)
2104{
2105  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
2106}
2107
2108/// \brief Adds, with saturation, the corresponding elements of two 128-bit
2109///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
2110///    of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh
2111///    are saturated to FFh. Negative sums are saturated to 00h.
2112///
2113/// \headerfile <x86intrin.h>
2114///
2115/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2116///
2117/// \param __a
2118///    A 128-bit unsigned [16 x i8] vector.
2119/// \param __b
2120///    A 128-bit unsigned [16 x i8] vector.
2121/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2122///    of both parameters.
2123static __inline__ __m128i __DEFAULT_FN_ATTRS
2124_mm_adds_epu8(__m128i __a, __m128i __b)
2125{
2126  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
2127}
2128
2129/// \brief Adds, with saturation, the corresponding elements of two 128-bit
2130///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
2131///    of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh
2132///    are saturated to FFFFh. Negative sums are saturated to 0000h.
2133///
2134/// \headerfile <x86intrin.h>
2135///
2136/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2137///
2138/// \param __a
2139///    A 128-bit unsigned [8 x i16] vector.
2140/// \param __b
2141///    A 128-bit unsigned [8 x i16] vector.
2142/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2143///    of both parameters.
2144static __inline__ __m128i __DEFAULT_FN_ATTRS
2145_mm_adds_epu16(__m128i __a, __m128i __b)
2146{
2147  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
2148}
2149
2150/// \brief Computes the rounded avarages of corresponding elements of two
2151///    128-bit unsigned [16 x i8] vectors, saving each result in the
2152///    corresponding element of a 128-bit result vector of [16 x i8].
2153///
2154/// \headerfile <x86intrin.h>
2155///
2156/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2157///
2158/// \param __a
2159///    A 128-bit unsigned [16 x i8] vector.
2160/// \param __b
2161///    A 128-bit unsigned [16 x i8] vector.
2162/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2163///    averages of both parameters.
2164static __inline__ __m128i __DEFAULT_FN_ATTRS
2165_mm_avg_epu8(__m128i __a, __m128i __b)
2166{
2167  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2168}
2169
2170/// \brief Computes the rounded avarages of corresponding elements of two
2171///    128-bit unsigned [8 x i16] vectors, saving each result in the
2172///    corresponding element of a 128-bit result vector of [8 x i16].
2173///
2174/// \headerfile <x86intrin.h>
2175///
2176/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2177///
2178/// \param __a
2179///    A 128-bit unsigned [8 x i16] vector.
2180/// \param __b
2181///    A 128-bit unsigned [8 x i16] vector.
2182/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2183///    averages of both parameters.
2184static __inline__ __m128i __DEFAULT_FN_ATTRS
2185_mm_avg_epu16(__m128i __a, __m128i __b)
2186{
2187  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2188}
2189
2190/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2191///    vectors, producing eight intermediate 32-bit signed integer products, and
2192///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2193///    [4 x i32] vector. For example, bits [15:0] of both parameters are
2194///    multiplied producing a 32-bit product, bits [31:16] of both parameters
2195///    are multiplied producing a 32-bit product, and the sum of those two
2196///    products becomes bits [31:0] of the result.
2197///
2198/// \headerfile <x86intrin.h>
2199///
2200/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2201///
2202/// \param __a
2203///    A 128-bit signed [8 x i16] vector.
2204/// \param __b
2205///    A 128-bit signed [8 x i16] vector.
2206/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2207///    of both parameters.
2208static __inline__ __m128i __DEFAULT_FN_ATTRS
2209_mm_madd_epi16(__m128i __a, __m128i __b)
2210{
2211  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2212}
2213
2214/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
2215///    vectors, saving the greater value from each comparison in the
2216///    corresponding element of a 128-bit result vector of [8 x i16].
2217///
2218/// \headerfile <x86intrin.h>
2219///
2220/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2221///
2222/// \param __a
2223///    A 128-bit signed [8 x i16] vector.
2224/// \param __b
2225///    A 128-bit signed [8 x i16] vector.
2226/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2227///    each comparison.
2228static __inline__ __m128i __DEFAULT_FN_ATTRS
2229_mm_max_epi16(__m128i __a, __m128i __b)
2230{
2231  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
2232}
2233
2234/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
2235///    vectors, saving the greater value from each comparison in the
2236///    corresponding element of a 128-bit result vector of [16 x i8].
2237///
2238/// \headerfile <x86intrin.h>
2239///
2240/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2241///
2242/// \param __a
2243///    A 128-bit unsigned [16 x i8] vector.
2244/// \param __b
2245///    A 128-bit unsigned [16 x i8] vector.
2246/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2247///    each comparison.
2248static __inline__ __m128i __DEFAULT_FN_ATTRS
2249_mm_max_epu8(__m128i __a, __m128i __b)
2250{
2251  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
2252}
2253
2254/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
2255///    vectors, saving the smaller value from each comparison in the
2256///    corresponding element of a 128-bit result vector of [8 x i16].
2257///
2258/// \headerfile <x86intrin.h>
2259///
2260/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2261///
2262/// \param __a
2263///    A 128-bit signed [8 x i16] vector.
2264/// \param __b
2265///    A 128-bit signed [8 x i16] vector.
2266/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2267///    each comparison.
2268static __inline__ __m128i __DEFAULT_FN_ATTRS
2269_mm_min_epi16(__m128i __a, __m128i __b)
2270{
2271  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
2272}
2273
2274/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
2275///    vectors, saving the smaller value from each comparison in the
2276///    corresponding element of a 128-bit result vector of [16 x i8].
2277///
2278/// \headerfile <x86intrin.h>
2279///
2280/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2281///
2282/// \param __a
2283///    A 128-bit unsigned [16 x i8] vector.
2284/// \param __b
2285///    A 128-bit unsigned [16 x i8] vector.
2286/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2287///    each comparison.
2288static __inline__ __m128i __DEFAULT_FN_ATTRS
2289_mm_min_epu8(__m128i __a, __m128i __b)
2290{
2291  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
2292}
2293
2294/// \brief Multiplies the corresponding elements of two signed [8 x i16]
2295///    vectors, saving the upper 16 bits of each 32-bit product in the
2296///    corresponding element of a 128-bit signed [8 x i16] result vector.
2297///
2298/// \headerfile <x86intrin.h>
2299///
2300/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2301///
2302/// \param __a
2303///    A 128-bit signed [8 x i16] vector.
2304/// \param __b
2305///    A 128-bit signed [8 x i16] vector.
2306/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2307///    each of the eight 32-bit products.
2308static __inline__ __m128i __DEFAULT_FN_ATTRS
2309_mm_mulhi_epi16(__m128i __a, __m128i __b)
2310{
2311  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2312}
2313
2314/// \brief Multiplies the corresponding elements of two unsigned [8 x i16]
2315///    vectors, saving the upper 16 bits of each 32-bit product in the
2316///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2317///
2318/// \headerfile <x86intrin.h>
2319///
2320/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2321///
2322/// \param __a
2323///    A 128-bit unsigned [8 x i16] vector.
2324/// \param __b
2325///    A 128-bit unsigned [8 x i16] vector.
2326/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2327///    of each of the eight 32-bit products.
2328static __inline__ __m128i __DEFAULT_FN_ATTRS
2329_mm_mulhi_epu16(__m128i __a, __m128i __b)
2330{
2331  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2332}
2333
2334/// \brief Multiplies the corresponding elements of two signed [8 x i16]
2335///    vectors, saving the lower 16 bits of each 32-bit product in the
2336///    corresponding element of a 128-bit signed [8 x i16] result vector.
2337///
2338/// \headerfile <x86intrin.h>
2339///
2340/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2341///
2342/// \param __a
2343///    A 128-bit signed [8 x i16] vector.
2344/// \param __b
2345///    A 128-bit signed [8 x i16] vector.
2346/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2347///    each of the eight 32-bit products.
2348static __inline__ __m128i __DEFAULT_FN_ATTRS
2349_mm_mullo_epi16(__m128i __a, __m128i __b)
2350{
2351  return (__m128i)((__v8hu)__a * (__v8hu)__b);
2352}
2353
2354/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
2355///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2356///    product.
2357///
2358/// \headerfile <x86intrin.h>
2359///
2360/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2361///
2362/// \param __a
2363///    A 64-bit integer containing one of the source operands.
2364/// \param __b
2365///    A 64-bit integer containing one of the source operands.
2366/// \returns A 64-bit integer vector containing the product of both operands.
2367static __inline__ __m64 __DEFAULT_FN_ATTRS
2368_mm_mul_su32(__m64 __a, __m64 __b)
2369{
2370  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2371}
2372
2373/// \brief Multiplies 32-bit unsigned integer values contained in the lower
2374///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2375///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2376///
2377/// \headerfile <x86intrin.h>
2378///
2379/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2380///
2381/// \param __a
2382///    A [2 x i64] vector containing one of the source operands.
2383/// \param __b
2384///    A [2 x i64] vector containing one of the source operands.
2385/// \returns A [2 x i64] vector containing the product of both operands.
2386static __inline__ __m128i __DEFAULT_FN_ATTRS
2387_mm_mul_epu32(__m128i __a, __m128i __b)
2388{
2389  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2390}
2391
2392/// \brief Computes the absolute differences of corresponding 8-bit integer
2393///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2394///    separately sums the second 8 absolute differences. Packs these two
2395///    unsigned 16-bit integer sums into the upper and lower elements of a
2396///    [2 x i64] vector.
2397///
2398/// \headerfile <x86intrin.h>
2399///
2400/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2401///
2402/// \param __a
2403///    A 128-bit integer vector containing one of the source operands.
2404/// \param __b
2405///    A 128-bit integer vector containing one of the source operands.
2406/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2407///    differences between both operands.
2408static __inline__ __m128i __DEFAULT_FN_ATTRS
2409_mm_sad_epu8(__m128i __a, __m128i __b)
2410{
2411  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2412}
2413
2414/// \brief Subtracts the corresponding 8-bit integer values in the operands.
2415///
2416/// \headerfile <x86intrin.h>
2417///
2418/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2419///
2420/// \param __a
2421///    A 128-bit integer vector containing the minuends.
2422/// \param __b
2423///    A 128-bit integer vector containing the subtrahends.
2424/// \returns A 128-bit integer vector containing the differences of the values
2425///    in the operands.
2426static __inline__ __m128i __DEFAULT_FN_ATTRS
2427_mm_sub_epi8(__m128i __a, __m128i __b)
2428{
2429  return (__m128i)((__v16qu)__a - (__v16qu)__b);
2430}
2431
2432/// \brief Subtracts the corresponding 16-bit integer values in the operands.
2433///
2434/// \headerfile <x86intrin.h>
2435///
2436/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2437///
2438/// \param __a
2439///    A 128-bit integer vector containing the minuends.
2440/// \param __b
2441///    A 128-bit integer vector containing the subtrahends.
2442/// \returns A 128-bit integer vector containing the differences of the values
2443///    in the operands.
2444static __inline__ __m128i __DEFAULT_FN_ATTRS
2445_mm_sub_epi16(__m128i __a, __m128i __b)
2446{
2447  return (__m128i)((__v8hu)__a - (__v8hu)__b);
2448}
2449
2450/// \brief Subtracts the corresponding 32-bit integer values in the operands.
2451///
2452/// \headerfile <x86intrin.h>
2453///
2454/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2455///
2456/// \param __a
2457///    A 128-bit integer vector containing the minuends.
2458/// \param __b
2459///    A 128-bit integer vector containing the subtrahends.
2460/// \returns A 128-bit integer vector containing the differences of the values
2461///    in the operands.
2462static __inline__ __m128i __DEFAULT_FN_ATTRS
2463_mm_sub_epi32(__m128i __a, __m128i __b)
2464{
2465  return (__m128i)((__v4su)__a - (__v4su)__b);
2466}
2467
2468/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
2469///    difference to the corresponding bits in the destination.
2470///
2471/// \headerfile <x86intrin.h>
2472///
2473/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2474///
2475/// \param __a
2476///    A 64-bit integer vector containing the minuend.
2477/// \param __b
2478///    A 64-bit integer vector containing the subtrahend.
2479/// \returns A 64-bit integer vector containing the difference of the values in
2480///    the operands.
2481static __inline__ __m64 __DEFAULT_FN_ATTRS
2482_mm_sub_si64(__m64 __a, __m64 __b)
2483{
2484  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2485}
2486
2487/// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
2488///
2489/// \headerfile <x86intrin.h>
2490///
2491/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2492///
2493/// \param __a
2494///    A 128-bit integer vector containing the minuends.
2495/// \param __b
2496///    A 128-bit integer vector containing the subtrahends.
2497/// \returns A 128-bit integer vector containing the differences of the values
2498///    in the operands.
2499static __inline__ __m128i __DEFAULT_FN_ATTRS
2500_mm_sub_epi64(__m128i __a, __m128i __b)
2501{
2502  return (__m128i)((__v2du)__a - (__v2du)__b);
2503}
2504
2505/// \brief Subtracts corresponding 8-bit signed integer values in the input and
2506///    returns the differences in the corresponding bytes in the destination.
2507///    Differences greater than 7Fh are saturated to 7Fh, and differences less
2508///    than 80h are saturated to 80h.
2509///
2510/// \headerfile <x86intrin.h>
2511///
2512/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2513///
2514/// \param __a
2515///    A 128-bit integer vector containing the minuends.
2516/// \param __b
2517///    A 128-bit integer vector containing the subtrahends.
2518/// \returns A 128-bit integer vector containing the differences of the values
2519///    in the operands.
2520static __inline__ __m128i __DEFAULT_FN_ATTRS
2521_mm_subs_epi8(__m128i __a, __m128i __b)
2522{
2523  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
2524}
2525
2526/// \brief Subtracts corresponding 16-bit signed integer values in the input and
2527///    returns the differences in the corresponding bytes in the destination.
2528///    Differences greater than 7FFFh are saturated to 7FFFh, and values less
2529///    than 8000h are saturated to 8000h.
2530///
2531/// \headerfile <x86intrin.h>
2532///
2533/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2534///
2535/// \param __a
2536///    A 128-bit integer vector containing the minuends.
2537/// \param __b
2538///    A 128-bit integer vector containing the subtrahends.
2539/// \returns A 128-bit integer vector containing the differences of the values
2540///    in the operands.
2541static __inline__ __m128i __DEFAULT_FN_ATTRS
2542_mm_subs_epi16(__m128i __a, __m128i __b)
2543{
2544  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
2545}
2546
2547/// \brief Subtracts corresponding 8-bit unsigned integer values in the input
2548///    and returns the differences in the corresponding bytes in the
2549///    destination. Differences less than 00h are saturated to 00h.
2550///
2551/// \headerfile <x86intrin.h>
2552///
2553/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2554///
2555/// \param __a
2556///    A 128-bit integer vector containing the minuends.
2557/// \param __b
2558///    A 128-bit integer vector containing the subtrahends.
2559/// \returns A 128-bit integer vector containing the unsigned integer
2560///    differences of the values in the operands.
2561static __inline__ __m128i __DEFAULT_FN_ATTRS
2562_mm_subs_epu8(__m128i __a, __m128i __b)
2563{
2564  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
2565}
2566
2567/// \brief Subtracts corresponding 16-bit unsigned integer values in the input
2568///    and returns the differences in the corresponding bytes in the
2569///    destination. Differences less than 0000h are saturated to 0000h.
2570///
2571/// \headerfile <x86intrin.h>
2572///
2573/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2574///
2575/// \param __a
2576///    A 128-bit integer vector containing the minuends.
2577/// \param __b
2578///    A 128-bit integer vector containing the subtrahends.
2579/// \returns A 128-bit integer vector containing the unsigned integer
2580///    differences of the values in the operands.
2581static __inline__ __m128i __DEFAULT_FN_ATTRS
2582_mm_subs_epu16(__m128i __a, __m128i __b)
2583{
2584  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
2585}
2586
2587/// \brief Performs a bitwise AND of two 128-bit integer vectors.
2588///
2589/// \headerfile <x86intrin.h>
2590///
2591/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2592///
2593/// \param __a
2594///    A 128-bit integer vector containing one of the source operands.
2595/// \param __b
2596///    A 128-bit integer vector containing one of the source operands.
2597/// \returns A 128-bit integer vector containing the bitwise AND of the values
2598///    in both operands.
2599static __inline__ __m128i __DEFAULT_FN_ATTRS
2600_mm_and_si128(__m128i __a, __m128i __b)
2601{
2602  return (__m128i)((__v2du)__a & (__v2du)__b);
2603}
2604
2605/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
2606///    one's complement of the values contained in the first source operand.
2607///
2608/// \headerfile <x86intrin.h>
2609///
2610/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2611///
2612/// \param __a
2613///    A 128-bit vector containing the left source operand. The one's complement
2614///    of this value is used in the bitwise AND.
2615/// \param __b
2616///    A 128-bit vector containing the right source operand.
2617/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2618///    complement of the first operand and the values in the second operand.
2619static __inline__ __m128i __DEFAULT_FN_ATTRS
2620_mm_andnot_si128(__m128i __a, __m128i __b)
2621{
2622  return (__m128i)(~(__v2du)__a & (__v2du)__b);
2623}
2624/// \brief Performs a bitwise OR of two 128-bit integer vectors.
2625///
2626/// \headerfile <x86intrin.h>
2627///
2628/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2629///
2630/// \param __a
2631///    A 128-bit integer vector containing one of the source operands.
2632/// \param __b
2633///    A 128-bit integer vector containing one of the source operands.
2634/// \returns A 128-bit integer vector containing the bitwise OR of the values
2635///    in both operands.
2636static __inline__ __m128i __DEFAULT_FN_ATTRS
2637_mm_or_si128(__m128i __a, __m128i __b)
2638{
2639  return (__m128i)((__v2du)__a | (__v2du)__b);
2640}
2641
2642/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
2643///
2644/// \headerfile <x86intrin.h>
2645///
2646/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2647///
2648/// \param __a
2649///    A 128-bit integer vector containing one of the source operands.
2650/// \param __b
2651///    A 128-bit integer vector containing one of the source operands.
2652/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2653///    values in both operands.
2654static __inline__ __m128i __DEFAULT_FN_ATTRS
2655_mm_xor_si128(__m128i __a, __m128i __b)
2656{
2657  return (__m128i)((__v2du)__a ^ (__v2du)__b);
2658}
2659
/// \brief Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
///    The shift is expressed as a byte shuffle of the vector produced by
///    _mm_setzero_si128() (first shuffle operand) and \a a (second shuffle
///    operand). Every shuffle index is computed from \a imm, so \a imm must be
///    usable in the constant index arguments of __builtin_shufflevector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a. The value is converted to char before use. If any of bits [7:4]
///    of the converted value are set, all sixteen indices select bytes of the
///    _mm_setzero_si128() operand instead of bytes of \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) __extension__ ({                              \
  (__m128i)__builtin_shufflevector(                                          \
                                 (__v16qi)_mm_setzero_si128(),               \
                                 (__v16qi)(__m128i)(a),                      \
                                 ((char)(imm)&0xF0) ?  0 : 16 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  1 : 17 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  2 : 18 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  3 : 19 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  4 : 20 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  5 : 21 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  6 : 22 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  7 : 23 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  8 : 24 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  9 : 25 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
2697
/// \brief Alternate spelling of _mm_slli_si128; both arguments are forwarded
///    to it unchanged.
///
/// \param a
///    Passed through as the first argument of _mm_slli_si128.
/// \param imm
///    Passed through as the second argument of _mm_slli_si128.
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
2700
2701/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
2702///    by the specified number of bits. Low-order bits are cleared.
2703///
2704/// \headerfile <x86intrin.h>
2705///
2706/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2707///
2708/// \param __a
2709///    A 128-bit integer vector containing the source operand.
2710/// \param __count
2711///    An integer value specifying the number of bits to left-shift each value
2712///    in operand \a __a.
2713/// \returns A 128-bit integer vector containing the left-shifted values.
2714static __inline__ __m128i __DEFAULT_FN_ATTRS
2715_mm_slli_epi16(__m128i __a, int __count)
2716{
2717  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2718}
2719
2720/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
2721///    by the specified number of bits. Low-order bits are cleared.
2722///
2723/// \headerfile <x86intrin.h>
2724///
2725/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2726///
2727/// \param __a
2728///    A 128-bit integer vector containing the source operand.
2729/// \param __count
2730///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2731///    to left-shift each value in operand \a __a.
2732/// \returns A 128-bit integer vector containing the left-shifted values.
2733static __inline__ __m128i __DEFAULT_FN_ATTRS
2734_mm_sll_epi16(__m128i __a, __m128i __count)
2735{
2736  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2737}
2738
2739/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
2740///    by the specified number of bits. Low-order bits are cleared.
2741///
2742/// \headerfile <x86intrin.h>
2743///
2744/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2745///
2746/// \param __a
2747///    A 128-bit integer vector containing the source operand.
2748/// \param __count
2749///    An integer value specifying the number of bits to left-shift each value
2750///    in operand \a __a.
2751/// \returns A 128-bit integer vector containing the left-shifted values.
2752static __inline__ __m128i __DEFAULT_FN_ATTRS
2753_mm_slli_epi32(__m128i __a, int __count)
2754{
2755  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2756}
2757
2758/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
2759///    by the specified number of bits. Low-order bits are cleared.
2760///
2761/// \headerfile <x86intrin.h>
2762///
2763/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2764///
2765/// \param __a
2766///    A 128-bit integer vector containing the source operand.
2767/// \param __count
2768///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2769///    to left-shift each value in operand \a __a.
2770/// \returns A 128-bit integer vector containing the left-shifted values.
2771static __inline__ __m128i __DEFAULT_FN_ATTRS
2772_mm_sll_epi32(__m128i __a, __m128i __count)
2773{
2774  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2775}
2776
2777/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
2778///    by the specified number of bits. Low-order bits are cleared.
2779///
2780/// \headerfile <x86intrin.h>
2781///
2782/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2783///
2784/// \param __a
2785///    A 128-bit integer vector containing the source operand.
2786/// \param __count
2787///    An integer value specifying the number of bits to left-shift each value
2788///    in operand \a __a.
2789/// \returns A 128-bit integer vector containing the left-shifted values.
2790static __inline__ __m128i __DEFAULT_FN_ATTRS
2791_mm_slli_epi64(__m128i __a, int __count)
2792{
2793  return __builtin_ia32_psllqi128((__v2di)__a, __count);
2794}
2795
2796/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
2797///    by the specified number of bits. Low-order bits are cleared.
2798///
2799/// \headerfile <x86intrin.h>
2800///
2801/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2802///
2803/// \param __a
2804///    A 128-bit integer vector containing the source operand.
2805/// \param __count
2806///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2807///    to left-shift each value in operand \a __a.
2808/// \returns A 128-bit integer vector containing the left-shifted values.
2809static __inline__ __m128i __DEFAULT_FN_ATTRS
2810_mm_sll_epi64(__m128i __a, __m128i __count)
2811{
2812  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2813}
2814
2815/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
2816///    by the specified number of bits. High-order bits are filled with the sign
2817///    bit of the initial value.
2818///
2819/// \headerfile <x86intrin.h>
2820///
2821/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2822///
2823/// \param __a
2824///    A 128-bit integer vector containing the source operand.
2825/// \param __count
2826///    An integer value specifying the number of bits to right-shift each value
2827///    in operand \a __a.
2828/// \returns A 128-bit integer vector containing the right-shifted values.
2829static __inline__ __m128i __DEFAULT_FN_ATTRS
2830_mm_srai_epi16(__m128i __a, int __count)
2831{
2832  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2833}
2834
2835/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
2836///    by the specified number of bits. High-order bits are filled with the sign
2837///    bit of the initial value.
2838///
2839/// \headerfile <x86intrin.h>
2840///
2841/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2842///
2843/// \param __a
2844///    A 128-bit integer vector containing the source operand.
2845/// \param __count
2846///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2847///    to right-shift each value in operand \a __a.
2848/// \returns A 128-bit integer vector containing the right-shifted values.
2849static __inline__ __m128i __DEFAULT_FN_ATTRS
2850_mm_sra_epi16(__m128i __a, __m128i __count)
2851{
2852  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2853}
2854
2855/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
2856///    by the specified number of bits. High-order bits are filled with the sign
2857///    bit of the initial value.
2858///
2859/// \headerfile <x86intrin.h>
2860///
2861/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2862///
2863/// \param __a
2864///    A 128-bit integer vector containing the source operand.
2865/// \param __count
2866///    An integer value specifying the number of bits to right-shift each value
2867///    in operand \a __a.
2868/// \returns A 128-bit integer vector containing the right-shifted values.
2869static __inline__ __m128i __DEFAULT_FN_ATTRS
2870_mm_srai_epi32(__m128i __a, int __count)
2871{
2872  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2873}
2874
2875/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
2876///    by the specified number of bits. High-order bits are filled with the sign
2877///    bit of the initial value.
2878///
2879/// \headerfile <x86intrin.h>
2880///
2881/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2882///
2883/// \param __a
2884///    A 128-bit integer vector containing the source operand.
2885/// \param __count
2886///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2887///    to right-shift each value in operand \a __a.
2888/// \returns A 128-bit integer vector containing the right-shifted values.
2889static __inline__ __m128i __DEFAULT_FN_ATTRS
2890_mm_sra_epi32(__m128i __a, __m128i __count)
2891{
2892  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2893}
2894
/// \brief Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
///    The shift is expressed as a byte shuffle of \a a (first shuffle
///    operand) and the vector produced by _mm_setzero_si128() (second shuffle
///    operand). Every shuffle index is computed from \a imm, so \a imm must be
///    usable in the constant index arguments of __builtin_shufflevector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a. The value is converted to char before use. If any of bits [7:4]
///    of the converted value are set, all sixteen indices select bytes of the
///    _mm_setzero_si128() operand instead of bytes of \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) __extension__ ({                              \
  (__m128i)__builtin_shufflevector(                                          \
                                 (__v16qi)(__m128i)(a),                      \
                                 (__v16qi)_mm_setzero_si128(),               \
                                 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0,  \
                                 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1,  \
                                 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2,  \
                                 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3,  \
                                 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4,  \
                                 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5,  \
                                 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6,  \
                                 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7,  \
                                 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8,  \
                                 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9,  \
                                 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
                                 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
                                 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
                                 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
                                 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
                                 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
2932
/// \brief Alternate spelling of _mm_srli_si128; both arguments are forwarded
///    to it unchanged.
///
/// \param a
///    Passed through as the first argument of _mm_srli_si128.
/// \param imm
///    Passed through as the second argument of _mm_srli_si128.
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
2935
2936/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
2937///    operand by the specified number of bits. High-order bits are cleared.
2938///
2939/// \headerfile <x86intrin.h>
2940///
2941/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2942///
2943/// \param __a
2944///    A 128-bit integer vector containing the source operand.
2945/// \param __count
2946///    An integer value specifying the number of bits to right-shift each value
2947///    in operand \a __a.
2948/// \returns A 128-bit integer vector containing the right-shifted values.
2949static __inline__ __m128i __DEFAULT_FN_ATTRS
2950_mm_srli_epi16(__m128i __a, int __count)
2951{
2952  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2953}
2954
2955/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
2956///    operand by the specified number of bits. High-order bits are cleared.
2957///
2958/// \headerfile <x86intrin.h>
2959///
2960/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2961///
2962/// \param __a
2963///    A 128-bit integer vector containing the source operand.
2964/// \param __count
2965///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2966///    to right-shift each value in operand \a __a.
2967/// \returns A 128-bit integer vector containing the right-shifted values.
2968static __inline__ __m128i __DEFAULT_FN_ATTRS
2969_mm_srl_epi16(__m128i __a, __m128i __count)
2970{
2971  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2972}
2973
2974/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
2975///    operand by the specified number of bits. High-order bits are cleared.
2976///
2977/// \headerfile <x86intrin.h>
2978///
2979/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2980///
2981/// \param __a
2982///    A 128-bit integer vector containing the source operand.
2983/// \param __count
2984///    An integer value specifying the number of bits to right-shift each value
2985///    in operand \a __a.
2986/// \returns A 128-bit integer vector containing the right-shifted values.
2987static __inline__ __m128i __DEFAULT_FN_ATTRS
2988_mm_srli_epi32(__m128i __a, int __count)
2989{
2990  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2991}
2992
2993/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
2994///    operand by the specified number of bits. High-order bits are cleared.
2995///
2996/// \headerfile <x86intrin.h>
2997///
2998/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2999///
3000/// \param __a
3001///    A 128-bit integer vector containing the source operand.
3002/// \param __count
3003///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3004///    to right-shift each value in operand \a __a.
3005/// \returns A 128-bit integer vector containing the right-shifted values.
3006static __inline__ __m128i __DEFAULT_FN_ATTRS
3007_mm_srl_epi32(__m128i __a, __m128i __count)
3008{
3009  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3010}
3011
3012/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
3013///    operand by the specified number of bits. High-order bits are cleared.
3014///
3015/// \headerfile <x86intrin.h>
3016///
3017/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3018///
3019/// \param __a
3020///    A 128-bit integer vector containing the source operand.
3021/// \param __count
3022///    An integer value specifying the number of bits to right-shift each value
3023///    in operand \a __a.
3024/// \returns A 128-bit integer vector containing the right-shifted values.
3025static __inline__ __m128i __DEFAULT_FN_ATTRS
3026_mm_srli_epi64(__m128i __a, int __count)
3027{
3028  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3029}
3030
/// \brief Right-shifts each of the 64-bit values in the 128-bit integer
///    vector operand by the specified number of bits. High-order bits are
///    cleared.
3033///
3034/// \headerfile <x86intrin.h>
3035///
3036/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3037///
3038/// \param __a
3039///    A 128-bit integer vector containing the source operand.
3040/// \param __count
3041///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3042///    to right-shift each value in operand \a __a.
3043/// \returns A 128-bit integer vector containing the right-shifted values.
3044static __inline__ __m128i __DEFAULT_FN_ATTRS
3045_mm_srl_epi64(__m128i __a, __m128i __count)
3046{
3047  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3048}
3049
3050/// \brief Compares each of the corresponding 8-bit values of the 128-bit
3051///    integer vectors for equality. Each comparison yields 0h for false, FFh
3052///    for true.
3053///
3054/// \headerfile <x86intrin.h>
3055///
3056/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3057///
3058/// \param __a
3059///    A 128-bit integer vector.
3060/// \param __b
3061///    A 128-bit integer vector.
3062/// \returns A 128-bit integer vector containing the comparison results.
3063static __inline__ __m128i __DEFAULT_FN_ATTRS
3064_mm_cmpeq_epi8(__m128i __a, __m128i __b)
3065{
3066  return (__m128i)((__v16qi)__a == (__v16qi)__b);
3067}
3068
3069/// \brief Compares each of the corresponding 16-bit values of the 128-bit
3070///    integer vectors for equality. Each comparison yields 0h for false, FFFFh
3071///    for true.
3072///
3073/// \headerfile <x86intrin.h>
3074///
3075/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3076///
3077/// \param __a
3078///    A 128-bit integer vector.
3079/// \param __b
3080///    A 128-bit integer vector.
3081/// \returns A 128-bit integer vector containing the comparison results.
3082static __inline__ __m128i __DEFAULT_FN_ATTRS
3083_mm_cmpeq_epi16(__m128i __a, __m128i __b)
3084{
3085  return (__m128i)((__v8hi)__a == (__v8hi)__b);
3086}
3087
3088/// \brief Compares each of the corresponding 32-bit values of the 128-bit
3089///    integer vectors for equality. Each comparison yields 0h for false,
3090///    FFFFFFFFh for true.
3091///
3092/// \headerfile <x86intrin.h>
3093///
3094/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3095///
3096/// \param __a
3097///    A 128-bit integer vector.
3098/// \param __b
3099///    A 128-bit integer vector.
3100/// \returns A 128-bit integer vector containing the comparison results.
3101static __inline__ __m128i __DEFAULT_FN_ATTRS
3102_mm_cmpeq_epi32(__m128i __a, __m128i __b)
3103{
3104  return (__m128i)((__v4si)__a == (__v4si)__b);
3105}
3106
3107/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
3108///    integer vectors to determine if the values in the first operand are
3109///    greater than those in the second operand. Each comparison yields 0h for
3110///    false, FFh for true.
3111///
3112/// \headerfile <x86intrin.h>
3113///
3114/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3115///
3116/// \param __a
3117///    A 128-bit integer vector.
3118/// \param __b
3119///    A 128-bit integer vector.
3120/// \returns A 128-bit integer vector containing the comparison results.
3121static __inline__ __m128i __DEFAULT_FN_ATTRS
3122_mm_cmpgt_epi8(__m128i __a, __m128i __b)
3123{
3124  /* This function always performs a signed comparison, but __v16qi is a char
3125     which may be signed or unsigned, so use __v16qs. */
3126  return (__m128i)((__v16qs)__a > (__v16qs)__b);
3127}
3128
3129/// \brief Compares each of the corresponding signed 16-bit values of the
3130///    128-bit integer vectors to determine if the values in the first operand
3131///    are greater than those in the second operand. Each comparison yields 0h
3132///    for false, FFFFh for true.
3133///
3134/// \headerfile <x86intrin.h>
3135///
3136/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3137///
3138/// \param __a
3139///    A 128-bit integer vector.
3140/// \param __b
3141///    A 128-bit integer vector.
3142/// \returns A 128-bit integer vector containing the comparison results.
3143static __inline__ __m128i __DEFAULT_FN_ATTRS
3144_mm_cmpgt_epi16(__m128i __a, __m128i __b)
3145{
3146  return (__m128i)((__v8hi)__a > (__v8hi)__b);
3147}
3148
3149/// \brief Compares each of the corresponding signed 32-bit values of the
3150///    128-bit integer vectors to determine if the values in the first operand
3151///    are greater than those in the second operand. Each comparison yields 0h
3152///    for false, FFFFFFFFh for true.
3153///
3154/// \headerfile <x86intrin.h>
3155///
3156/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3157///
3158/// \param __a
3159///    A 128-bit integer vector.
3160/// \param __b
3161///    A 128-bit integer vector.
3162/// \returns A 128-bit integer vector containing the comparison results.
3163static __inline__ __m128i __DEFAULT_FN_ATTRS
3164_mm_cmpgt_epi32(__m128i __a, __m128i __b)
3165{
3166  return (__m128i)((__v4si)__a > (__v4si)__b);
3167}
3168
3169/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
3170///    integer vectors to determine if the values in the first operand are less
3171///    than those in the second operand. Each comparison yields 0h for false,
3172///    FFh for true.
3173///
3174/// \headerfile <x86intrin.h>
3175///
3176/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3177///
3178/// \param __a
3179///    A 128-bit integer vector.
3180/// \param __b
3181///    A 128-bit integer vector.
3182/// \returns A 128-bit integer vector containing the comparison results.
3183static __inline__ __m128i __DEFAULT_FN_ATTRS
3184_mm_cmplt_epi8(__m128i __a, __m128i __b)
3185{
3186  return _mm_cmpgt_epi8(__b, __a);
3187}
3188
3189/// \brief Compares each of the corresponding signed 16-bit values of the
3190///    128-bit integer vectors to determine if the values in the first operand
3191///    are less than those in the second operand. Each comparison yields 0h for
3192///    false, FFFFh for true.
3193///
3194/// \headerfile <x86intrin.h>
3195///
3196/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3197///
3198/// \param __a
3199///    A 128-bit integer vector.
3200/// \param __b
3201///    A 128-bit integer vector.
3202/// \returns A 128-bit integer vector containing the comparison results.
3203static __inline__ __m128i __DEFAULT_FN_ATTRS
3204_mm_cmplt_epi16(__m128i __a, __m128i __b)
3205{
3206  return _mm_cmpgt_epi16(__b, __a);
3207}
3208
3209/// \brief Compares each of the corresponding signed 32-bit values of the
3210///    128-bit integer vectors to determine if the values in the first operand
3211///    are less than those in the second operand. Each comparison yields 0h for
3212///    false, FFFFFFFFh for true.
3213///
3214/// \headerfile <x86intrin.h>
3215///
3216/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3217///
3218/// \param __a
3219///    A 128-bit integer vector.
3220/// \param __b
3221///    A 128-bit integer vector.
3222/// \returns A 128-bit integer vector containing the comparison results.
3223static __inline__ __m128i __DEFAULT_FN_ATTRS
3224_mm_cmplt_epi32(__m128i __a, __m128i __b)
3225{
3226  return _mm_cmpgt_epi32(__b, __a);
3227}
3228
3229#ifdef __x86_64__
3230/// \brief Converts a 64-bit signed integer value from the second operand into a
3231///    double-precision value and returns it in the lower element of a [2 x
3232///    double] vector; the upper element of the returned vector is copied from
3233///    the upper element of the first operand.
3234///
3235/// \headerfile <x86intrin.h>
3236///
3237/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3238///
3239/// \param __a
3240///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3241///    copied to the upper 64 bits of the destination.
3242/// \param __b
3243///    A 64-bit signed integer operand containing the value to be converted.
3244/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3245///    converted value of the second operand. The upper 64 bits are copied from
3246///    the upper 64 bits of the first operand.
3247static __inline__ __m128d __DEFAULT_FN_ATTRS
3248_mm_cvtsi64_sd(__m128d __a, long long __b)
3249{
3250  __a[0] = __b;
3251  return __a;
3252}
3253
3254/// \brief Converts the first (lower) element of a vector of [2 x double] into a
3255///    64-bit signed integer value, according to the current rounding mode.
3256///
3257/// \headerfile <x86intrin.h>
3258///
3259/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3260///
3261/// \param __a
3262///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3263///    conversion.
3264/// \returns A 64-bit signed integer containing the converted value.
3265static __inline__ long long __DEFAULT_FN_ATTRS
3266_mm_cvtsd_si64(__m128d __a)
3267{
3268  return __builtin_ia32_cvtsd2si64((__v2df)__a);
3269}
3270
3271/// \brief Converts the first (lower) element of a vector of [2 x double] into a
3272///    64-bit signed integer value, truncating the result when it is inexact.
3273///
3274/// \headerfile <x86intrin.h>
3275///
3276/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3277///   instruction.
3278///
3279/// \param __a
3280///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3281///    conversion.
3282/// \returns A 64-bit signed integer containing the converted value.
3283static __inline__ long long __DEFAULT_FN_ATTRS
3284_mm_cvttsd_si64(__m128d __a)
3285{
3286  return __builtin_ia32_cvttsd2si64((__v2df)__a);
3287}
3288#endif
3289
3290/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
3291///
3292/// \headerfile <x86intrin.h>
3293///
3294/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3295///
3296/// \param __a
3297///    A 128-bit integer vector.
3298/// \returns A 128-bit vector of [4 x float] containing the converted values.
3299static __inline__ __m128 __DEFAULT_FN_ATTRS
3300_mm_cvtepi32_ps(__m128i __a)
3301{
3302  return __builtin_ia32_cvtdq2ps((__v4si)__a);
3303}
3304
3305/// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
3306///
3307/// \headerfile <x86intrin.h>
3308///
3309/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3310///
3311/// \param __a
3312///    A 128-bit vector of [4 x float].
3313/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3314///    values.
3315static __inline__ __m128i __DEFAULT_FN_ATTRS
3316_mm_cvtps_epi32(__m128 __a)
3317{
3318  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3319}
3320
3321/// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
3322///    truncating the result when it is inexact.
3323///
3324/// \headerfile <x86intrin.h>
3325///
3326/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3327///   instruction.
3328///
3329/// \param __a
3330///    A 128-bit vector of [4 x float].
3331/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3332static __inline__ __m128i __DEFAULT_FN_ATTRS
3333_mm_cvttps_epi32(__m128 __a)
3334{
3335  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3336}
3337
3338/// \brief Returns a vector of [4 x i32] where the lowest element is the input
3339///    operand and the remaining elements are zero.
3340///
3341/// \headerfile <x86intrin.h>
3342///
3343/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3344///
3345/// \param __a
3346///    A 32-bit signed integer operand.
3347/// \returns A 128-bit vector of [4 x i32].
3348static __inline__ __m128i __DEFAULT_FN_ATTRS
3349_mm_cvtsi32_si128(int __a)
3350{
3351  return (__m128i)(__v4si){ __a, 0, 0, 0 };
3352}
3353
3354#ifdef __x86_64__
3355/// \brief Returns a vector of [2 x i64] where the lower element is the input
3356///    operand and the upper element is zero.
3357///
3358/// \headerfile <x86intrin.h>
3359///
3360/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3361///
3362/// \param __a
3363///    A 64-bit signed integer operand containing the value to be converted.
3364/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3365static __inline__ __m128i __DEFAULT_FN_ATTRS
3366_mm_cvtsi64_si128(long long __a)
3367{
3368  return (__m128i){ __a, 0 };
3369}
3370#endif
3371
3372/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
3373///    32-bit signed integer value.
3374///
3375/// \headerfile <x86intrin.h>
3376///
3377/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3378///
3379/// \param __a
3380///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3381///    destination.
3382/// \returns A 32-bit signed integer containing the moved value.
3383static __inline__ int __DEFAULT_FN_ATTRS
3384_mm_cvtsi128_si32(__m128i __a)
3385{
3386  __v4si __b = (__v4si)__a;
3387  return __b[0];
3388}
3389
3390#ifdef __x86_64__
3391/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
3392///    64-bit signed integer value.
3393///
3394/// \headerfile <x86intrin.h>
3395///
3396/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3397///
3398/// \param __a
3399///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3400///    destination.
3401/// \returns A 64-bit signed integer containing the moved value.
3402static __inline__ long long __DEFAULT_FN_ATTRS
3403_mm_cvtsi128_si64(__m128i __a)
3404{
3405  return __a[0];
3406}
3407#endif
3408
3409/// \brief Moves packed integer values from an aligned 128-bit memory location
3410///    to elements in a 128-bit integer vector.
3411///
3412/// \headerfile <x86intrin.h>
3413///
3414/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3415///
3416/// \param __p
3417///    An aligned pointer to a memory location containing integer values.
3418/// \returns A 128-bit integer vector containing the moved values.
3419static __inline__ __m128i __DEFAULT_FN_ATTRS
3420_mm_load_si128(__m128i const *__p)
3421{
3422  return *__p;
3423}
3424
3425/// \brief Moves packed integer values from an unaligned 128-bit memory location
3426///    to elements in a 128-bit integer vector.
3427///
3428/// \headerfile <x86intrin.h>
3429///
3430/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3431///
3432/// \param __p
3433///    A pointer to a memory location containing integer values.
3434/// \returns A 128-bit integer vector containing the moved values.
3435static __inline__ __m128i __DEFAULT_FN_ATTRS
3436_mm_loadu_si128(__m128i const *__p)
3437{
3438  struct __loadu_si128 {
3439    __m128i __v;
3440  } __attribute__((__packed__, __may_alias__));
3441  return ((struct __loadu_si128*)__p)->__v;
3442}
3443
3444/// \brief Returns a vector of [2 x i64] where the lower element is taken from
3445///    the lower element of the operand, and the upper element is zero.
3446///
3447/// \headerfile <x86intrin.h>
3448///
3449/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3450///
/// \param __p
///    A pointer to a 128-bit vector of [2 x i64]. Only bits [63:0] are read;
///    they are written to bits [63:0] of the destination.
3454/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3455///    moved value. The higher order bits are cleared.
3456static __inline__ __m128i __DEFAULT_FN_ATTRS
3457_mm_loadl_epi64(__m128i const *__p)
3458{
3459  struct __mm_loadl_epi64_struct {
3460    long long __u;
3461  } __attribute__((__packed__, __may_alias__));
3462  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
3463}
3464
3465/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
3466///    This could be used as an argument to another intrinsic function where the
3467///    argument is required but the value is not actually used.
3468///
3469/// \headerfile <x86intrin.h>
3470///
3471/// This intrinsic has no corresponding instruction.
3472///
3473/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3474static __inline__ __m128i __DEFAULT_FN_ATTRS
3475_mm_undefined_si128(void)
3476{
3477  return (__m128i)__builtin_ia32_undef128();
3478}
3479
3480/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3481///    the specified 64-bit integer values.
3482///
3483/// \headerfile <x86intrin.h>
3484///
3485/// This intrinsic is a utility function and does not correspond to a specific
3486///    instruction.
3487///
3488/// \param __q1
3489///    A 64-bit integer value used to initialize the upper 64 bits of the
3490///    destination vector of [2 x i64].
3491/// \param __q0
3492///    A 64-bit integer value used to initialize the lower 64 bits of the
3493///    destination vector of [2 x i64].
3494/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3495///    provided in the operands.
3496static __inline__ __m128i __DEFAULT_FN_ATTRS
3497_mm_set_epi64x(long long __q1, long long __q0)
3498{
3499  return (__m128i){ __q0, __q1 };
3500}
3501
3502/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3503///    the specified 64-bit integer values.
3504///
3505/// \headerfile <x86intrin.h>
3506///
3507/// This intrinsic is a utility function and does not correspond to a specific
3508///    instruction.
3509///
3510/// \param __q1
3511///    A 64-bit integer value used to initialize the upper 64 bits of the
3512///    destination vector of [2 x i64].
3513/// \param __q0
3514///    A 64-bit integer value used to initialize the lower 64 bits of the
3515///    destination vector of [2 x i64].
3516/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3517///    provided in the operands.
3518static __inline__ __m128i __DEFAULT_FN_ATTRS
3519_mm_set_epi64(__m64 __q1, __m64 __q0)
3520{
3521  return (__m128i){ (long long)__q0, (long long)__q1 };
3522}
3523
3524/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3525///    the specified 32-bit integer values.
3526///
3527/// \headerfile <x86intrin.h>
3528///
3529/// This intrinsic is a utility function and does not correspond to a specific
3530///    instruction.
3531///
3532/// \param __i3
3533///    A 32-bit integer value used to initialize bits [127:96] of the
3534///    destination vector.
3535/// \param __i2
3536///    A 32-bit integer value used to initialize bits [95:64] of the destination
3537///    vector.
3538/// \param __i1
3539///    A 32-bit integer value used to initialize bits [63:32] of the destination
3540///    vector.
3541/// \param __i0
3542///    A 32-bit integer value used to initialize bits [31:0] of the destination
3543///    vector.
3544/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3545///    provided in the operands.
3546static __inline__ __m128i __DEFAULT_FN_ATTRS
3547_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
3548{
3549  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3550}
3551
3552/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3553///    the specified 16-bit integer values.
3554///
3555/// \headerfile <x86intrin.h>
3556///
3557/// This intrinsic is a utility function and does not correspond to a specific
3558///    instruction.
3559///
3560/// \param __w7
3561///    A 16-bit integer value used to initialize bits [127:112] of the
3562///    destination vector.
3563/// \param __w6
3564///    A 16-bit integer value used to initialize bits [111:96] of the
3565///    destination vector.
3566/// \param __w5
3567///    A 16-bit integer value used to initialize bits [95:80] of the destination
3568///    vector.
3569/// \param __w4
3570///    A 16-bit integer value used to initialize bits [79:64] of the destination
3571///    vector.
3572/// \param __w3
3573///    A 16-bit integer value used to initialize bits [63:48] of the destination
3574///    vector.
3575/// \param __w2
3576///    A 16-bit integer value used to initialize bits [47:32] of the destination
3577///    vector.
3578/// \param __w1
3579///    A 16-bit integer value used to initialize bits [31:16] of the destination
3580///    vector.
3581/// \param __w0
3582///    A 16-bit integer value used to initialize bits [15:0] of the destination
3583///    vector.
3584/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3585///    provided in the operands.
3586static __inline__ __m128i __DEFAULT_FN_ATTRS
3587_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
3588{
3589  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3590}
3591
3592/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3593///    the specified 8-bit integer values.
3594///
3595/// \headerfile <x86intrin.h>
3596///
3597/// This intrinsic is a utility function and does not correspond to a specific
3598///    instruction.
3599///
3600/// \param __b15
3601///    Initializes bits [127:120] of the destination vector.
3602/// \param __b14
3603///    Initializes bits [119:112] of the destination vector.
3604/// \param __b13
3605///    Initializes bits [111:104] of the destination vector.
3606/// \param __b12
3607///    Initializes bits [103:96] of the destination vector.
3608/// \param __b11
3609///    Initializes bits [95:88] of the destination vector.
3610/// \param __b10
3611///    Initializes bits [87:80] of the destination vector.
3612/// \param __b9
3613///    Initializes bits [79:72] of the destination vector.
3614/// \param __b8
3615///    Initializes bits [71:64] of the destination vector.
3616/// \param __b7
3617///    Initializes bits [63:56] of the destination vector.
3618/// \param __b6
3619///    Initializes bits [55:48] of the destination vector.
3620/// \param __b5
3621///    Initializes bits [47:40] of the destination vector.
3622/// \param __b4
3623///    Initializes bits [39:32] of the destination vector.
3624/// \param __b3
3625///    Initializes bits [31:24] of the destination vector.
3626/// \param __b2
3627///    Initializes bits [23:16] of the destination vector.
3628/// \param __b1
3629///    Initializes bits [15:8] of the destination vector.
3630/// \param __b0
3631///    Initializes bits [7:0] of the destination vector.
3632/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3633///    provided in the operands.
3634static __inline__ __m128i __DEFAULT_FN_ATTRS
3635_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
3636{
3637  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3638}
3639
3640/// \brief Initializes both values in a 128-bit integer vector with the
3641///    specified 64-bit integer value.
3642///
3643/// \headerfile <x86intrin.h>
3644///
3645/// This intrinsic is a utility function and does not correspond to a specific
3646///    instruction.
3647///
3648/// \param __q
3649///    Integer value used to initialize the elements of the destination integer
3650///    vector.
3651/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3652///    elements containing the value provided in the operand.
3653static __inline__ __m128i __DEFAULT_FN_ATTRS
3654_mm_set1_epi64x(long long __q)
3655{
3656  return (__m128i){ __q, __q };
3657}
3658
3659/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
3660///    specified 64-bit value.
3661///
3662/// \headerfile <x86intrin.h>
3663///
3664/// This intrinsic is a utility function and does not correspond to a specific
3665///    instruction.
3666///
3667/// \param __q
3668///    A 64-bit value used to initialize the elements of the destination integer
3669///    vector.
3670/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3671///    containing the value provided in the operand.
3672static __inline__ __m128i __DEFAULT_FN_ATTRS
3673_mm_set1_epi64(__m64 __q)
3674{
3675  return (__m128i){ (long long)__q, (long long)__q };
3676}
3677
3678/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
3679///    specified 32-bit value.
3680///
3681/// \headerfile <x86intrin.h>
3682///
3683/// This intrinsic is a utility function and does not correspond to a specific
3684///    instruction.
3685///
3686/// \param __i
3687///    A 32-bit value used to initialize the elements of the destination integer
3688///    vector.
3689/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3690///    containing the value provided in the operand.
3691static __inline__ __m128i __DEFAULT_FN_ATTRS
3692_mm_set1_epi32(int __i)
3693{
3694  return (__m128i)(__v4si){ __i, __i, __i, __i };
3695}
3696
3697/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
3698///    specified 16-bit value.
3699///
3700/// \headerfile <x86intrin.h>
3701///
3702/// This intrinsic is a utility function and does not correspond to a specific
3703///    instruction.
3704///
3705/// \param __w
3706///    A 16-bit value used to initialize the elements of the destination integer
3707///    vector.
3708/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3709///    containing the value provided in the operand.
3710static __inline__ __m128i __DEFAULT_FN_ATTRS
3711_mm_set1_epi16(short __w)
3712{
3713  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
3714}
3715
3716/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
3717///    specified 8-bit value.
3718///
3719/// \headerfile <x86intrin.h>
3720///
3721/// This intrinsic is a utility function and does not correspond to a specific
3722///    instruction.
3723///
3724/// \param __b
3725///    An 8-bit value used to initialize the elements of the destination integer
3726///    vector.
3727/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3728///    containing the value provided in the operand.
3729static __inline__ __m128i __DEFAULT_FN_ATTRS
3730_mm_set1_epi8(char __b)
3731{
3732  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
3733}
3734
3735/// \brief Constructs a 128-bit integer vector, initialized in reverse order
3736///     with the specified 64-bit integral values.
3737///
3738/// \headerfile <x86intrin.h>
3739///
3740/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
3741///   instruction.
3742///
3743/// \param __q0
3744///    A 64-bit integral value used to initialize the lower 64 bits of the
3745///    result.
3746/// \param __q1
3747///    A 64-bit integral value used to initialize the upper 64 bits of the
3748///    result.
3749/// \returns An initialized 128-bit integer vector.
3750static __inline__ __m128i __DEFAULT_FN_ATTRS
3751_mm_setr_epi64(__m64 __q0, __m64 __q1)
3752{
3753  return (__m128i){ (long long)__q0, (long long)__q1 };
3754}
3755
3756/// \brief Constructs a 128-bit integer vector, initialized in reverse order
3757///     with the specified 32-bit integral values.
3758///
3759/// \headerfile <x86intrin.h>
3760///
3761/// This intrinsic is a utility function and does not correspond to a specific
3762///    instruction.
3763///
3764/// \param __i0
3765///    A 32-bit integral value used to initialize bits [31:0] of the result.
3766/// \param __i1
3767///    A 32-bit integral value used to initialize bits [63:32] of the result.
3768/// \param __i2
3769///    A 32-bit integral value used to initialize bits [95:64] of the result.
3770/// \param __i3
3771///    A 32-bit integral value used to initialize bits [127:96] of the result.
3772/// \returns An initialized 128-bit integer vector.
3773static __inline__ __m128i __DEFAULT_FN_ATTRS
3774_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
3775{
3776  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3777}
3778
3779/// \brief Constructs a 128-bit integer vector, initialized in reverse order
3780///     with the specified 16-bit integral values.
3781///
3782/// \headerfile <x86intrin.h>
3783///
3784/// This intrinsic is a utility function and does not correspond to a specific
3785///    instruction.
3786///
3787/// \param __w0
3788///    A 16-bit integral value used to initialize bits [15:0] of the result.
3789/// \param __w1
3790///    A 16-bit integral value used to initialize bits [31:16] of the result.
3791/// \param __w2
3792///    A 16-bit integral value used to initialize bits [47:32] of the result.
3793/// \param __w3
3794///    A 16-bit integral value used to initialize bits [63:48] of the result.
3795/// \param __w4
3796///    A 16-bit integral value used to initialize bits [79:64] of the result.
3797/// \param __w5
3798///    A 16-bit integral value used to initialize bits [95:80] of the result.
3799/// \param __w6
3800///    A 16-bit integral value used to initialize bits [111:96] of the result.
3801/// \param __w7
3802///    A 16-bit integral value used to initialize bits [127:112] of the result.
3803/// \returns An initialized 128-bit integer vector.
3804static __inline__ __m128i __DEFAULT_FN_ATTRS
3805_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
3806{
3807  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3808}
3809
3810/// \brief Constructs a 128-bit integer vector, initialized in reverse order
3811///     with the specified 8-bit integral values.
3812///
3813/// \headerfile <x86intrin.h>
3814///
3815/// This intrinsic is a utility function and does not correspond to a specific
3816///    instruction.
3817///
3818/// \param __b0
3819///    An 8-bit integral value used to initialize bits [7:0] of the result.
3820/// \param __b1
3821///    An 8-bit integral value used to initialize bits [15:8] of the result.
3822/// \param __b2
3823///    An 8-bit integral value used to initialize bits [23:16] of the result.
3824/// \param __b3
3825///    An 8-bit integral value used to initialize bits [31:24] of the result.
3826/// \param __b4
3827///    An 8-bit integral value used to initialize bits [39:32] of the result.
3828/// \param __b5
3829///    An 8-bit integral value used to initialize bits [47:40] of the result.
3830/// \param __b6
3831///    An 8-bit integral value used to initialize bits [55:48] of the result.
3832/// \param __b7
3833///    An 8-bit integral value used to initialize bits [63:56] of the result.
3834/// \param __b8
3835///    An 8-bit integral value used to initialize bits [71:64] of the result.
3836/// \param __b9
3837///    An 8-bit integral value used to initialize bits [79:72] of the result.
3838/// \param __b10
3839///    An 8-bit integral value used to initialize bits [87:80] of the result.
3840/// \param __b11
3841///    An 8-bit integral value used to initialize bits [95:88] of the result.
3842/// \param __b12
3843///    An 8-bit integral value used to initialize bits [103:96] of the result.
3844/// \param __b13
3845///    An 8-bit integral value used to initialize bits [111:104] of the result.
3846/// \param __b14
3847///    An 8-bit integral value used to initialize bits [119:112] of the result.
3848/// \param __b15
3849///    An 8-bit integral value used to initialize bits [127:120] of the result.
3850/// \returns An initialized 128-bit integer vector.
3851static __inline__ __m128i __DEFAULT_FN_ATTRS
3852_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
3853{
3854  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3855}
3856
3857/// \brief Creates a 128-bit integer vector initialized to zero.
3858///
3859/// \headerfile <x86intrin.h>
3860///
3861/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3862///
3863/// \returns An initialized 128-bit integer vector with all elements set to
3864///    zero.
3865static __inline__ __m128i __DEFAULT_FN_ATTRS
3866_mm_setzero_si128(void)
3867{
3868  return (__m128i){ 0LL, 0LL };
3869}
3870
3871/// \brief Stores a 128-bit integer vector to a memory location aligned on a
3872///    128-bit boundary.
3873///
3874/// \headerfile <x86intrin.h>
3875///
3876/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3877///
3878/// \param __p
3879///    A pointer to an aligned memory location that will receive the integer
3880///    values.
3881/// \param __b
3882///    A 128-bit integer vector containing the values to be moved.
3883static __inline__ void __DEFAULT_FN_ATTRS
3884_mm_store_si128(__m128i *__p, __m128i __b)
3885{
3886  *__p = __b;
3887}
3888
3889/// \brief Stores a 128-bit integer vector to an unaligned memory location.
3890///
3891/// \headerfile <x86intrin.h>
3892///
3893/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3894///
3895/// \param __p
3896///    A pointer to a memory location that will receive the integer values.
3897/// \param __b
3898///    A 128-bit integer vector containing the values to be moved.
3899static __inline__ void __DEFAULT_FN_ATTRS
3900_mm_storeu_si128(__m128i *__p, __m128i __b)
3901{
3902  struct __storeu_si128 {
3903    __m128i __v;
3904  } __attribute__((__packed__, __may_alias__));
3905  ((struct __storeu_si128*)__p)->__v = __b;
3906}
3907
3908/// \brief Moves bytes selected by the mask from the first operand to the
3909///    specified unaligned memory location. When a mask bit is 1, the
3910///    corresponding byte is written, otherwise it is not written. To minimize
3911///    caching, the date is flagged as non-temporal (unlikely to be used again
3912///    soon). Exception and trap behavior for elements not selected for storage
3913///    to memory are implementation dependent.
3914///
3915/// \headerfile <x86intrin.h>
3916///
3917/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3918///   instruction.
3919///
3920/// \param __d
3921///    A 128-bit integer vector containing the values to be moved.
3922/// \param __n
3923///    A 128-bit integer vector containing the mask. The most significant bit of
3924///    each byte represents the mask bits.
3925/// \param __p
3926///    A pointer to an unaligned 128-bit memory location where the specified
3927///    values are moved.
3928static __inline__ void __DEFAULT_FN_ATTRS
3929_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
3930{
3931  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3932}
3933
3934/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3935///    a memory location.
3936///
3937/// \headerfile <x86intrin.h>
3938///
3939/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3940///
3941/// \param __p
3942///    A pointer to a 64-bit memory location that will receive the lower 64 bits
3943///    of the integer vector parameter.
3944/// \param __a
3945///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3946///    value to be stored.
3947static __inline__ void __DEFAULT_FN_ATTRS
3948_mm_storel_epi64(__m128i *__p, __m128i __a)
3949{
3950  struct __mm_storel_epi64_struct {
3951    long long __u;
3952  } __attribute__((__packed__, __may_alias__));
3953  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
3954}
3955
3956/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3957///    aligned memory location. To minimize caching, the data is flagged as
3958///    non-temporal (unlikely to be used again soon).
3959///
3960/// \headerfile <x86intrin.h>
3961///
3962/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3963///
3964/// \param __p
3965///    A pointer to the 128-bit aligned memory location used to store the value.
3966/// \param __a
3967///    A vector of [2 x double] containing the 64-bit values to be stored.
3968static __inline__ void __DEFAULT_FN_ATTRS
3969_mm_stream_pd(double *__p, __m128d __a)
3970{
3971  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
3972}
3973
3974/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
3975///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3976///    used again soon).
3977///
3978/// \headerfile <x86intrin.h>
3979///
3980/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3981///
3982/// \param __p
3983///    A pointer to the 128-bit aligned memory location used to store the value.
3984/// \param __a
3985///    A 128-bit integer vector containing the values to be stored.
3986static __inline__ void __DEFAULT_FN_ATTRS
3987_mm_stream_si128(__m128i *__p, __m128i __a)
3988{
3989  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
3990}
3991
3992/// \brief Stores a 32-bit integer value in the specified memory location. To
3993///    minimize caching, the data is flagged as non-temporal (unlikely to be
3994///    used again soon).
3995///
3996/// \headerfile <x86intrin.h>
3997///
3998/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
3999///
4000/// \param __p
4001///    A pointer to the 32-bit memory location used to store the value.
4002/// \param __a
4003///    A 32-bit integer containing the value to be stored.
4004static __inline__ void __DEFAULT_FN_ATTRS
4005_mm_stream_si32(int *__p, int __a)
4006{
4007  __builtin_ia32_movnti(__p, __a);
4008}
4009
4010#ifdef __x86_64__
4011/// \brief Stores a 64-bit integer value in the specified memory location. To
4012///    minimize caching, the data is flagged as non-temporal (unlikely to be
4013///    used again soon).
4014///
4015/// \headerfile <x86intrin.h>
4016///
4017/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4018///
4019/// \param __p
4020///    A pointer to the 64-bit memory location used to store the value.
4021/// \param __a
4022///    A 64-bit integer containing the value to be stored.
4023static __inline__ void __DEFAULT_FN_ATTRS
4024_mm_stream_si64(long long *__p, long long __a)
4025{
4026  __builtin_ia32_movnti64(__p, __a);
4027}
4028#endif
4029
#ifdef __cplusplus
extern "C" {
#endif

/// \brief Flushes and invalidates the cache line that contains \a __p from
///    all caches in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// \brief Forces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, so that all earlier loads complete before any later
///    load executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// \brief Forces strong memory ordering (serialization) between the load and
///    store instructions that precede this instruction and the load and store
///    instructions that follow it, so that all earlier memory accesses
///    complete before any later memory access executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} /* extern "C" */
#endif
4071
4072/// \brief Converts 16-bit signed integers from both 128-bit integer vector
4073///    operands into 8-bit signed integers, and packs the results into the
4074///    destination. Positive values greater than 0x7F are saturated to 0x7F.
4075///    Negative values less than 0x80 are saturated to 0x80.
4076///
4077/// \headerfile <x86intrin.h>
4078///
4079/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4080///
4081/// \param __a
4082///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4083///   a signed integer and is converted to a 8-bit signed integer with
4084///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4085///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4086///   written to the lower 64 bits of the result.
4087/// \param __b
4088///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4089///   a signed integer and is converted to a 8-bit signed integer with
4090///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4091///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4092///   written to the higher 64 bits of the result.
4093/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4094static __inline__ __m128i __DEFAULT_FN_ATTRS
4095_mm_packs_epi16(__m128i __a, __m128i __b)
4096{
4097  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4098}
4099
4100/// \brief Converts 32-bit signed integers from both 128-bit integer vector
4101///    operands into 16-bit signed integers, and packs the results into the
4102///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4103///    Negative values less than 0x8000 are saturated to 0x8000.
4104///
4105/// \headerfile <x86intrin.h>
4106///
4107/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4108///
4109/// \param __a
4110///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4111///    a signed integer and is converted to a 16-bit signed integer with
4112///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4113///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4114///    are written to the lower 64 bits of the result.
4115/// \param __b
4116///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4117///    a signed integer and is converted to a 16-bit signed integer with
4118///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4119///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4120///    are written to the higher 64 bits of the result.
4121/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4122static __inline__ __m128i __DEFAULT_FN_ATTRS
4123_mm_packs_epi32(__m128i __a, __m128i __b)
4124{
4125  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4126}
4127
4128/// \brief Converts 16-bit signed integers from both 128-bit integer vector
4129///    operands into 8-bit unsigned integers, and packs the results into the
4130///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
4131///    than 0x00 are saturated to 0x00.
4132///
4133/// \headerfile <x86intrin.h>
4134///
4135/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4136///
4137/// \param __a
4138///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4139///    a signed integer and is converted to an 8-bit unsigned integer with
4140///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4141///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4142///    written to the lower 64 bits of the result.
4143/// \param __b
4144///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4145///    a signed integer and is converted to an 8-bit unsigned integer with
4146///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4147///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4148///    written to the higher 64 bits of the result.
4149/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4150static __inline__ __m128i __DEFAULT_FN_ATTRS
4151_mm_packus_epi16(__m128i __a, __m128i __b)
4152{
4153  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4154}
4155
4156/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4157///    the immediate-value parameter as a selector.
4158///
4159/// \headerfile <x86intrin.h>
4160///
4161/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4162///
4163/// \param __a
4164///    A 128-bit integer vector.
4165/// \param __imm
4166///    An immediate value. Bits [2:0] selects values from \a __a to be assigned
4167///    to bits[15:0] of the result. \n
4168///    000: assign values from bits [15:0] of \a __a. \n
4169///    001: assign values from bits [31:16] of \a __a. \n
4170///    010: assign values from bits [47:32] of \a __a. \n
4171///    011: assign values from bits [63:48] of \a __a. \n
4172///    100: assign values from bits [79:64] of \a __a. \n
4173///    101: assign values from bits [95:80] of \a __a. \n
4174///    110: assign values from bits [111:96] of \a __a. \n
4175///    111: assign values from bits [127:112] of \a __a.
4176/// \returns An integer, whose lower 16 bits are selected from the 128-bit
4177///    integer vector parameter and the remaining bits are assigned zeros.
4178static __inline__ int __DEFAULT_FN_ATTRS
4179_mm_extract_epi16(__m128i __a, int __imm)
4180{
4181  __v8hi __b = (__v8hi)__a;
4182  return (unsigned short)__b[__imm & 7];
4183}
4184
4185/// \brief Constructs a 128-bit integer vector by first making a copy of the
4186///    128-bit integer vector parameter, and then inserting the lower 16 bits
4187///    of an integer parameter into an offset specified by the immediate-value
4188///    parameter.
4189///
4190/// \headerfile <x86intrin.h>
4191///
4192/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4193///
4194/// \param __a
4195///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
4196///    result and then one of the eight elements in the result is replaced by
4197///    the lower 16 bits of \a __b.
4198/// \param __b
4199///    An integer. The lower 16 bits of this parameter are written to the
4200///    result beginning at an offset specified by \a __imm.
4201/// \param __imm
4202///    An immediate value specifying the bit offset in the result at which the
4203///    lower 16 bits of \a __b are written.
4204/// \returns A 128-bit integer vector containing the constructed values.
4205static __inline__ __m128i __DEFAULT_FN_ATTRS
4206_mm_insert_epi16(__m128i __a, int __b, int __imm)
4207{
4208  __v8hi __c = (__v8hi)__a;
4209  __c[__imm & 7] = __b;
4210  return (__m128i)__c;
4211}
4212
4213/// \brief Copies the values of the most significant bits from each 8-bit
4214///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4215///    value, zero-extends the value, and writes it to the destination.
4216///
4217/// \headerfile <x86intrin.h>
4218///
4219/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4220///
4221/// \param __a
4222///    A 128-bit integer vector containing the values with bits to be extracted.
4223/// \returns The most significant bits from each 8-bit element in \a __a,
4224///    written to bits [15:0]. The other bits are assigned zeros.
4225static __inline__ int __DEFAULT_FN_ATTRS
4226_mm_movemask_epi8(__m128i __a)
4227{
4228  return __builtin_ia32_pmovmskb128((__v16qi)__a);
4229}
4230
/// \brief Builds a 128-bit integer vector by rearranging the four 32-bit
///    elements of a 128-bit integer vector parameter as directed by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector holding the values to be copied.
/// \param imm
///    An immediate value whose low 8 bits specify which elements of \a a are
///    copied. Each 2-bit field selects the source for one 32-bit destination
///    element: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v4si)(__m128i)(a), (__v4si)_mm_undefined_si128(), \
      (imm) & 3, \
      ((imm) >> 2) & 3, \
      ((imm) >> 4) & 3, \
      ((imm) >> 6) & 3); })
4264
/// \brief Builds a 128-bit integer vector by rearranging the four lower
///    16-bit elements of a 128-bit integer vector of [8 x i16] as directed by
///    the immediate-value parameter. The upper four elements pass through
///    unchanged.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)(__m128i)(a), (__v8hi)_mm_undefined_si128(), \
      (imm) & 3, \
      ((imm) >> 2) & 3, \
      ((imm) >> 4) & 3, \
      ((imm) >> 6) & 3, \
      4, 5, 6, 7); })
4298
/// \brief Builds a 128-bit integer vector by rearranging the four upper
///    16-bit elements of a 128-bit integer vector of [8 x i16] as directed by
///    the immediate-value parameter. The lower four elements pass through
///    unchanged.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)(__m128i)(a), (__v8hi)_mm_undefined_si128(), \
      0, 1, 2, 3, \
      ((imm) & 3) + 4, \
      (((imm) >> 2) & 3) + 4, \
      (((imm) >> 4) & 3) + 4, \
      (((imm) >> 6) & 3) + 4); })
4334
4335/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
4336///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4337///
4338/// \headerfile <x86intrin.h>
4339///
4340/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4341///   instruction.
4342///
4343/// \param __a
4344///    A 128-bit vector of [16 x i8].
4345///    Bits [71:64] are written to bits [7:0] of the result. \n
4346///    Bits [79:72] are written to bits [23:16] of the result. \n
4347///    Bits [87:80] are written to bits [39:32] of the result. \n
4348///    Bits [95:88] are written to bits [55:48] of the result. \n
4349///    Bits [103:96] are written to bits [71:64] of the result. \n
4350///    Bits [111:104] are written to bits [87:80] of the result. \n
4351///    Bits [119:112] are written to bits [103:96] of the result. \n
4352///    Bits [127:120] are written to bits [119:112] of the result.
4353/// \param __b
4354///    A 128-bit vector of [16 x i8]. \n
4355///    Bits [71:64] are written to bits [15:8] of the result. \n
4356///    Bits [79:72] are written to bits [31:24] of the result. \n
4357///    Bits [87:80] are written to bits [47:40] of the result. \n
4358///    Bits [95:88] are written to bits [63:56] of the result. \n
4359///    Bits [103:96] are written to bits [79:72] of the result. \n
4360///    Bits [111:104] are written to bits [95:88] of the result. \n
4361///    Bits [119:112] are written to bits [111:104] of the result. \n
4362///    Bits [127:120] are written to bits [127:120] of the result.
4363/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4364static __inline__ __m128i __DEFAULT_FN_ATTRS
4365_mm_unpackhi_epi8(__m128i __a, __m128i __b)
4366{
4367  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
4368}
4369
4370/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4371///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4372///
4373/// \headerfile <x86intrin.h>
4374///
4375/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4376///   instruction.
4377///
4378/// \param __a
4379///    A 128-bit vector of [8 x i16].
4380///    Bits [79:64] are written to bits [15:0] of the result. \n
4381///    Bits [95:80] are written to bits [47:32] of the result. \n
4382///    Bits [111:96] are written to bits [79:64] of the result. \n
4383///    Bits [127:112] are written to bits [111:96] of the result.
4384/// \param __b
4385///    A 128-bit vector of [8 x i16].
4386///    Bits [79:64] are written to bits [31:16] of the result. \n
4387///    Bits [95:80] are written to bits [63:48] of the result. \n
4388///    Bits [111:96] are written to bits [95:80] of the result. \n
4389///    Bits [127:112] are written to bits [127:112] of the result.
4390/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4391static __inline__ __m128i __DEFAULT_FN_ATTRS
4392_mm_unpackhi_epi16(__m128i __a, __m128i __b)
4393{
4394  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
4395}
4396
4397/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4398///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4399///
4400/// \headerfile <x86intrin.h>
4401///
4402/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4403///   instruction.
4404///
4405/// \param __a
4406///    A 128-bit vector of [4 x i32]. \n
4407///    Bits [95:64] are written to bits [31:0] of the destination. \n
4408///    Bits [127:96] are written to bits [95:64] of the destination.
4409/// \param __b
4410///    A 128-bit vector of [4 x i32]. \n
4411///    Bits [95:64] are written to bits [64:32] of the destination. \n
4412///    Bits [127:96] are written to bits [127:96] of the destination.
4413/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4414static __inline__ __m128i __DEFAULT_FN_ATTRS
4415_mm_unpackhi_epi32(__m128i __a, __m128i __b)
4416{
4417  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
4418}
4419
4420/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
4421///    of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4422///
4423/// \headerfile <x86intrin.h>
4424///
4425/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4426///   instruction.
4427///
4428/// \param __a
4429///    A 128-bit vector of [2 x i64]. \n
4430///    Bits [127:64] are written to bits [63:0] of the destination.
4431/// \param __b
4432///    A 128-bit vector of [2 x i64]. \n
4433///    Bits [127:64] are written to bits [127:64] of the destination.
4434/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4435static __inline__ __m128i __DEFAULT_FN_ATTRS
4436_mm_unpackhi_epi64(__m128i __a, __m128i __b)
4437{
4438  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
4439}
4440
4441/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4442///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4443///
4444/// \headerfile <x86intrin.h>
4445///
4446/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4447///   instruction.
4448///
4449/// \param __a
4450///    A 128-bit vector of [16 x i8]. \n
4451///    Bits [7:0] are written to bits [7:0] of the result. \n
4452///    Bits [15:8] are written to bits [23:16] of the result. \n
4453///    Bits [23:16] are written to bits [39:32] of the result. \n
4454///    Bits [31:24] are written to bits [55:48] of the result. \n
4455///    Bits [39:32] are written to bits [71:64] of the result. \n
4456///    Bits [47:40] are written to bits [87:80] of the result. \n
4457///    Bits [55:48] are written to bits [103:96] of the result. \n
4458///    Bits [63:56] are written to bits [119:112] of the result.
4459/// \param __b
4460///    A 128-bit vector of [16 x i8].
4461///    Bits [7:0] are written to bits [15:8] of the result. \n
4462///    Bits [15:8] are written to bits [31:24] of the result. \n
4463///    Bits [23:16] are written to bits [47:40] of the result. \n
4464///    Bits [31:24] are written to bits [63:56] of the result. \n
4465///    Bits [39:32] are written to bits [79:72] of the result. \n
4466///    Bits [47:40] are written to bits [95:88] of the result. \n
4467///    Bits [55:48] are written to bits [111:104] of the result. \n
4468///    Bits [63:56] are written to bits [127:120] of the result.
4469/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4470static __inline__ __m128i __DEFAULT_FN_ATTRS
4471_mm_unpacklo_epi8(__m128i __a, __m128i __b)
4472{
4473  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
4474}
4475
4476/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
4477///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4478///    [8 x i16].
4479///
4480/// \headerfile <x86intrin.h>
4481///
4482/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4483///   instruction.
4484///
4485/// \param __a
4486///    A 128-bit vector of [8 x i16].
4487///    Bits [15:0] are written to bits [15:0] of the result. \n
4488///    Bits [31:16] are written to bits [47:32] of the result. \n
4489///    Bits [47:32] are written to bits [79:64] of the result. \n
4490///    Bits [63:48] are written to bits [111:96] of the result.
4491/// \param __b
4492///    A 128-bit vector of [8 x i16].
4493///    Bits [15:0] are written to bits [31:16] of the result. \n
4494///    Bits [31:16] are written to bits [63:48] of the result. \n
4495///    Bits [47:32] are written to bits [95:80] of the result. \n
4496///    Bits [63:48] are written to bits [127:112] of the result.
4497/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4498static __inline__ __m128i __DEFAULT_FN_ATTRS
4499_mm_unpacklo_epi16(__m128i __a, __m128i __b)
4500{
4501  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
4502}
4503
4504/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4505///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4506///
4507/// \headerfile <x86intrin.h>
4508///
4509/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4510///   instruction.
4511///
4512/// \param __a
4513///    A 128-bit vector of [4 x i32]. \n
4514///    Bits [31:0] are written to bits [31:0] of the destination. \n
4515///    Bits [63:32] are written to bits [95:64] of the destination.
4516/// \param __b
4517///    A 128-bit vector of [4 x i32]. \n
4518///    Bits [31:0] are written to bits [64:32] of the destination. \n
4519///    Bits [63:32] are written to bits [127:96] of the destination.
4520/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4521static __inline__ __m128i __DEFAULT_FN_ATTRS
4522_mm_unpacklo_epi32(__m128i __a, __m128i __b)
4523{
4524  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
4525}
4526
4527/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
4528///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4529///
4530/// \headerfile <x86intrin.h>
4531///
4532/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4533///   instruction.
4534///
4535/// \param __a
4536///    A 128-bit vector of [2 x i64]. \n
4537///    Bits [63:0] are written to bits [63:0] of the destination. \n
4538/// \param __b
4539///    A 128-bit vector of [2 x i64]. \n
4540///    Bits [63:0] are written to bits [127:64] of the destination. \n
4541/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4542static __inline__ __m128i __DEFAULT_FN_ATTRS
4543_mm_unpacklo_epi64(__m128i __a, __m128i __b)
4544{
4545  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
4546}
4547
4548/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4549///    integer.
4550///
4551/// \headerfile <x86intrin.h>
4552///
4553/// This intrinsic has no corresponding instruction.
4554///
4555/// \param __a
4556///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4557///    destination.
4558/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4559static __inline__ __m64 __DEFAULT_FN_ATTRS
4560_mm_movepi64_pi64(__m128i __a)
4561{
4562  return (__m64)__a[0];
4563}
4564
4565/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4566///    upper bits.
4567///
4568/// \headerfile <x86intrin.h>
4569///
4570/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
4571///
4572/// \param __a
4573///    A 64-bit value.
4574/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4575///    the operand. The upper 64 bits are assigned zeros.
4576static __inline__ __m128i __DEFAULT_FN_ATTRS
4577_mm_movpi64_epi64(__m64 __a)
4578{
4579  return (__m128i){ (long long)__a, 0 };
4580}
4581
4582/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4583///    integer vector, zeroing the upper bits.
4584///
4585/// \headerfile <x86intrin.h>
4586///
4587/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4588///
4589/// \param __a
4590///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4591///    destination.
4592/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4593///    the operand. The upper 64 bits are assigned zeros.
4594static __inline__ __m128i __DEFAULT_FN_ATTRS
4595_mm_move_epi64(__m128i __a)
4596{
4597  return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
4598}
4599
4600/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
4601///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4602///    double].
4603///
4604/// \headerfile <x86intrin.h>
4605///
4606/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4607///
4608/// \param __a
4609///    A 128-bit vector of [2 x double]. \n
4610///    Bits [127:64] are written to bits [63:0] of the destination.
4611/// \param __b
4612///    A 128-bit vector of [2 x double]. \n
4613///    Bits [127:64] are written to bits [127:64] of the destination.
4614/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4615static __inline__ __m128d __DEFAULT_FN_ATTRS
4616_mm_unpackhi_pd(__m128d __a, __m128d __b)
4617{
4618  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
4619}
4620
4621/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
4622///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4623///    double].
4624///
4625/// \headerfile <x86intrin.h>
4626///
4627/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4628///
4629/// \param __a
4630///    A 128-bit vector of [2 x double]. \n
4631///    Bits [63:0] are written to bits [63:0] of the destination.
4632/// \param __b
4633///    A 128-bit vector of [2 x double]. \n
4634///    Bits [63:0] are written to bits [127:64] of the destination.
4635/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4636static __inline__ __m128d __DEFAULT_FN_ATTRS
4637_mm_unpacklo_pd(__m128d __a, __m128d __b)
4638{
4639  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
4640}
4641
4642/// \brief Extracts the sign bits of the double-precision values in the 128-bit
4643///    vector of [2 x double], zero-extends the value, and writes it to the
4644///    low-order bits of the destination.
4645///
4646/// \headerfile <x86intrin.h>
4647///
4648/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4649///
4650/// \param __a
4651///    A 128-bit vector of [2 x double] containing the values with sign bits to
4652///    be extracted.
4653/// \returns The sign bits from each of the double-precision elements in \a __a,
4654///    written to bits [1:0]. The remaining bits are assigned values of zero.
4655static __inline__ int __DEFAULT_FN_ATTRS
4656_mm_movemask_pd(__m128d __a)
4657{
4658  return __builtin_ia32_movmskpd((__v2df)__a);
4659}
4660
4661
/// \brief Builds a 128-bit vector of [2 x double] by picking one element from
///    each of two 128-bit vectors of [2 x double], as directed by an
///    immediate-value selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double]; supplies the lower result element.
/// \param b
///    A 128-bit vector of [2 x double]; supplies the upper result element.
/// \param i
///    An 8-bit immediate value. Only the two least significant bits are
///    used: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), \
                                   (__v2df)(__m128d)(b), \
                                   ((i) & 0x1), \
                                   2 + (((i) >> 1) & 0x1)); })
4690
4691/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4692///    floating-point vector of [4 x float].
4693///
4694/// \headerfile <x86intrin.h>
4695///
4696/// This intrinsic has no corresponding instruction.
4697///
4698/// \param __a
4699///    A 128-bit floating-point vector of [2 x double].
4700/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4701///    bitwise pattern as the parameter.
4702static __inline__ __m128 __DEFAULT_FN_ATTRS
4703_mm_castpd_ps(__m128d __a)
4704{
4705  return (__m128)__a;
4706}
4707
4708/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4709///    integer vector.
4710///
4711/// \headerfile <x86intrin.h>
4712///
4713/// This intrinsic has no corresponding instruction.
4714///
4715/// \param __a
4716///    A 128-bit floating-point vector of [2 x double].
4717/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4718///    parameter.
4719static __inline__ __m128i __DEFAULT_FN_ATTRS
4720_mm_castpd_si128(__m128d __a)
4721{
4722  return (__m128i)__a;
4723}
4724
4725/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4726///    floating-point vector of [2 x double].
4727///
4728/// \headerfile <x86intrin.h>
4729///
4730/// This intrinsic has no corresponding instruction.
4731///
4732/// \param __a
4733///    A 128-bit floating-point vector of [4 x float].
4734/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4735///    bitwise pattern as the parameter.
4736static __inline__ __m128d __DEFAULT_FN_ATTRS
4737_mm_castps_pd(__m128 __a)
4738{
4739  return (__m128d)__a;
4740}
4741
4742/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4743///    integer vector.
4744///
4745/// \headerfile <x86intrin.h>
4746///
4747/// This intrinsic has no corresponding instruction.
4748///
4749/// \param __a
4750///    A 128-bit floating-point vector of [4 x float].
4751/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4752///    parameter.
4753static __inline__ __m128i __DEFAULT_FN_ATTRS
4754_mm_castps_si128(__m128 __a)
4755{
4756  return (__m128i)__a;
4757}
4758
4759/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
4760///    of [4 x float].
4761///
4762/// \headerfile <x86intrin.h>
4763///
4764/// This intrinsic has no corresponding instruction.
4765///
4766/// \param __a
4767///    A 128-bit integer vector.
4768/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4769///    bitwise pattern as the parameter.
4770static __inline__ __m128 __DEFAULT_FN_ATTRS
4771_mm_castsi128_ps(__m128i __a)
4772{
4773  return (__m128)__a;
4774}
4775
4776/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
4777///    of [2 x double].
4778///
4779/// \headerfile <x86intrin.h>
4780///
4781/// This intrinsic has no corresponding instruction.
4782///
4783/// \param __a
4784///    A 128-bit integer vector.
4785/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4786///    bitwise pattern as the parameter.
4787static __inline__ __m128d __DEFAULT_FN_ATTRS
4788_mm_castsi128_pd(__m128i __a)
4789{
4790  return (__m128d)__a;
4791}
4792
#ifdef __cplusplus
extern "C" {
#endif

/// \brief Signals that the caller is executing a spin loop, so that power
///    consumption can be optimized for the duration of the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} // extern "C"
#endif
// Retire the function-attribute shorthand used by the definitions above so it
// is not visible past this point.
#undef __DEFAULT_FN_ATTRS

/// Combines two selector values into one: \a x shifted left by one bit, OR-ed
/// with \a y (e.g. for the immediate operand of _mm_shuffle_pd).
#define _MM_SHUFFLE2(x, y) \
  (((x) << 1) | (y))

/// Values of the denormals-are-zero mode, as read and written through the
/// accessor macros below.
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)

/// Mask isolating the denormals-are-zero field of the control/status word.
#define _MM_DENORMALS_ZERO_MASK (0x0040)

/// Reads the control/status word with _mm_getcsr() and keeps only the
/// denormals-are-zero field.
#define _MM_GET_DENORMALS_ZERO_MODE() \
  (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)

/// Clears the denormals-are-zero field of the current control/status word,
/// ORs in \a x, and writes the result back with _mm_setcsr().
#define _MM_SET_DENORMALS_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4820
4821#endif /* __EMMINTRIN_H */
4822