1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#include <xmmintrin.h>
28
/* Public 128-bit vector types: two doubles, and a generic integer vector. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Element-typed 16-byte views used internally for casts. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned element views. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* Explicitly signed char view (plain char has implementation-defined
 * signedness). This type is not meant to appear in the interface. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
46
47#include <f16cintrin.h>
48
/* Attribute set shared by every function defined in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
51
52static __inline__ __m128d __DEFAULT_FN_ATTRS
53_mm_add_sd(__m128d __a, __m128d __b)
54{
55  __a[0] += __b[0];
56  return __a;
57}
58
59static __inline__ __m128d __DEFAULT_FN_ATTRS
60_mm_add_pd(__m128d __a, __m128d __b)
61{
62  return (__m128d)((__v2df)__a + (__v2df)__b);
63}
64
65static __inline__ __m128d __DEFAULT_FN_ATTRS
66_mm_sub_sd(__m128d __a, __m128d __b)
67{
68  __a[0] -= __b[0];
69  return __a;
70}
71
72static __inline__ __m128d __DEFAULT_FN_ATTRS
73_mm_sub_pd(__m128d __a, __m128d __b)
74{
75  return (__m128d)((__v2df)__a - (__v2df)__b);
76}
77
78static __inline__ __m128d __DEFAULT_FN_ATTRS
79_mm_mul_sd(__m128d __a, __m128d __b)
80{
81  __a[0] *= __b[0];
82  return __a;
83}
84
85static __inline__ __m128d __DEFAULT_FN_ATTRS
86_mm_mul_pd(__m128d __a, __m128d __b)
87{
88  return (__m128d)((__v2df)__a * (__v2df)__b);
89}
90
91static __inline__ __m128d __DEFAULT_FN_ATTRS
92_mm_div_sd(__m128d __a, __m128d __b)
93{
94  __a[0] /= __b[0];
95  return __a;
96}
97
98static __inline__ __m128d __DEFAULT_FN_ATTRS
99_mm_div_pd(__m128d __a, __m128d __b)
100{
101  return (__m128d)((__v2df)__a / (__v2df)__b);
102}
103
104static __inline__ __m128d __DEFAULT_FN_ATTRS
105_mm_sqrt_sd(__m128d __a, __m128d __b)
106{
107  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
108  return (__m128d) { __c[0], __a[1] };
109}
110
111static __inline__ __m128d __DEFAULT_FN_ATTRS
112_mm_sqrt_pd(__m128d __a)
113{
114  return __builtin_ia32_sqrtpd((__v2df)__a);
115}
116
117static __inline__ __m128d __DEFAULT_FN_ATTRS
118_mm_min_sd(__m128d __a, __m128d __b)
119{
120  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
121}
122
123static __inline__ __m128d __DEFAULT_FN_ATTRS
124_mm_min_pd(__m128d __a, __m128d __b)
125{
126  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
127}
128
129static __inline__ __m128d __DEFAULT_FN_ATTRS
130_mm_max_sd(__m128d __a, __m128d __b)
131{
132  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
133}
134
135static __inline__ __m128d __DEFAULT_FN_ATTRS
136_mm_max_pd(__m128d __a, __m128d __b)
137{
138  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
139}
140
141static __inline__ __m128d __DEFAULT_FN_ATTRS
142_mm_and_pd(__m128d __a, __m128d __b)
143{
144  return (__m128d)((__v4su)__a & (__v4su)__b);
145}
146
147static __inline__ __m128d __DEFAULT_FN_ATTRS
148_mm_andnot_pd(__m128d __a, __m128d __b)
149{
150  return (__m128d)(~(__v4su)__a & (__v4su)__b);
151}
152
153static __inline__ __m128d __DEFAULT_FN_ATTRS
154_mm_or_pd(__m128d __a, __m128d __b)
155{
156  return (__m128d)((__v4su)__a | (__v4su)__b);
157}
158
159static __inline__ __m128d __DEFAULT_FN_ATTRS
160_mm_xor_pd(__m128d __a, __m128d __b)
161{
162  return (__m128d)((__v4su)__a ^ (__v4su)__b);
163}
164
165static __inline__ __m128d __DEFAULT_FN_ATTRS
166_mm_cmpeq_pd(__m128d __a, __m128d __b)
167{
168  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
169}
170
171static __inline__ __m128d __DEFAULT_FN_ATTRS
172_mm_cmplt_pd(__m128d __a, __m128d __b)
173{
174  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
175}
176
177static __inline__ __m128d __DEFAULT_FN_ATTRS
178_mm_cmple_pd(__m128d __a, __m128d __b)
179{
180  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
181}
182
183static __inline__ __m128d __DEFAULT_FN_ATTRS
184_mm_cmpgt_pd(__m128d __a, __m128d __b)
185{
186  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
187}
188
189static __inline__ __m128d __DEFAULT_FN_ATTRS
190_mm_cmpge_pd(__m128d __a, __m128d __b)
191{
192  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
193}
194
195static __inline__ __m128d __DEFAULT_FN_ATTRS
196_mm_cmpord_pd(__m128d __a, __m128d __b)
197{
198  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
199}
200
201static __inline__ __m128d __DEFAULT_FN_ATTRS
202_mm_cmpunord_pd(__m128d __a, __m128d __b)
203{
204  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
205}
206
207static __inline__ __m128d __DEFAULT_FN_ATTRS
208_mm_cmpneq_pd(__m128d __a, __m128d __b)
209{
210  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
211}
212
213static __inline__ __m128d __DEFAULT_FN_ATTRS
214_mm_cmpnlt_pd(__m128d __a, __m128d __b)
215{
216  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
217}
218
219static __inline__ __m128d __DEFAULT_FN_ATTRS
220_mm_cmpnle_pd(__m128d __a, __m128d __b)
221{
222  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
223}
224
225static __inline__ __m128d __DEFAULT_FN_ATTRS
226_mm_cmpngt_pd(__m128d __a, __m128d __b)
227{
228  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
229}
230
231static __inline__ __m128d __DEFAULT_FN_ATTRS
232_mm_cmpnge_pd(__m128d __a, __m128d __b)
233{
234  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
235}
236
237static __inline__ __m128d __DEFAULT_FN_ATTRS
238_mm_cmpeq_sd(__m128d __a, __m128d __b)
239{
240  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
241}
242
243static __inline__ __m128d __DEFAULT_FN_ATTRS
244_mm_cmplt_sd(__m128d __a, __m128d __b)
245{
246  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
247}
248
249static __inline__ __m128d __DEFAULT_FN_ATTRS
250_mm_cmple_sd(__m128d __a, __m128d __b)
251{
252  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
253}
254
255static __inline__ __m128d __DEFAULT_FN_ATTRS
256_mm_cmpgt_sd(__m128d __a, __m128d __b)
257{
258  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
259  return (__m128d) { __c[0], __a[1] };
260}
261
262static __inline__ __m128d __DEFAULT_FN_ATTRS
263_mm_cmpge_sd(__m128d __a, __m128d __b)
264{
265  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
266  return (__m128d) { __c[0], __a[1] };
267}
268
269static __inline__ __m128d __DEFAULT_FN_ATTRS
270_mm_cmpord_sd(__m128d __a, __m128d __b)
271{
272  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
273}
274
275static __inline__ __m128d __DEFAULT_FN_ATTRS
276_mm_cmpunord_sd(__m128d __a, __m128d __b)
277{
278  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
279}
280
281static __inline__ __m128d __DEFAULT_FN_ATTRS
282_mm_cmpneq_sd(__m128d __a, __m128d __b)
283{
284  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
285}
286
287static __inline__ __m128d __DEFAULT_FN_ATTRS
288_mm_cmpnlt_sd(__m128d __a, __m128d __b)
289{
290  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
291}
292
293static __inline__ __m128d __DEFAULT_FN_ATTRS
294_mm_cmpnle_sd(__m128d __a, __m128d __b)
295{
296  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
297}
298
299static __inline__ __m128d __DEFAULT_FN_ATTRS
300_mm_cmpngt_sd(__m128d __a, __m128d __b)
301{
302  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
303  return (__m128d) { __c[0], __a[1] };
304}
305
306static __inline__ __m128d __DEFAULT_FN_ATTRS
307_mm_cmpnge_sd(__m128d __a, __m128d __b)
308{
309  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
310  return (__m128d) { __c[0], __a[1] };
311}
312
313static __inline__ int __DEFAULT_FN_ATTRS
314_mm_comieq_sd(__m128d __a, __m128d __b)
315{
316  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
317}
318
319static __inline__ int __DEFAULT_FN_ATTRS
320_mm_comilt_sd(__m128d __a, __m128d __b)
321{
322  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
323}
324
325static __inline__ int __DEFAULT_FN_ATTRS
326_mm_comile_sd(__m128d __a, __m128d __b)
327{
328  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
329}
330
331static __inline__ int __DEFAULT_FN_ATTRS
332_mm_comigt_sd(__m128d __a, __m128d __b)
333{
334  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
335}
336
337static __inline__ int __DEFAULT_FN_ATTRS
338_mm_comige_sd(__m128d __a, __m128d __b)
339{
340  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
341}
342
343static __inline__ int __DEFAULT_FN_ATTRS
344_mm_comineq_sd(__m128d __a, __m128d __b)
345{
346  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
347}
348
349static __inline__ int __DEFAULT_FN_ATTRS
350_mm_ucomieq_sd(__m128d __a, __m128d __b)
351{
352  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
353}
354
355static __inline__ int __DEFAULT_FN_ATTRS
356_mm_ucomilt_sd(__m128d __a, __m128d __b)
357{
358  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
359}
360
361static __inline__ int __DEFAULT_FN_ATTRS
362_mm_ucomile_sd(__m128d __a, __m128d __b)
363{
364  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
365}
366
367static __inline__ int __DEFAULT_FN_ATTRS
368_mm_ucomigt_sd(__m128d __a, __m128d __b)
369{
370  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
371}
372
373static __inline__ int __DEFAULT_FN_ATTRS
374_mm_ucomige_sd(__m128d __a, __m128d __b)
375{
376  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
377}
378
379static __inline__ int __DEFAULT_FN_ATTRS
380_mm_ucomineq_sd(__m128d __a, __m128d __b)
381{
382  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
383}
384
385static __inline__ __m128 __DEFAULT_FN_ATTRS
386_mm_cvtpd_ps(__m128d __a)
387{
388  return __builtin_ia32_cvtpd2ps((__v2df)__a);
389}
390
391static __inline__ __m128d __DEFAULT_FN_ATTRS
392_mm_cvtps_pd(__m128 __a)
393{
394  return (__m128d) __builtin_convertvector(
395      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
396}
397
398static __inline__ __m128d __DEFAULT_FN_ATTRS
399_mm_cvtepi32_pd(__m128i __a)
400{
401  return (__m128d) __builtin_convertvector(
402      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
403}
404
405static __inline__ __m128i __DEFAULT_FN_ATTRS
406_mm_cvtpd_epi32(__m128d __a)
407{
408  return __builtin_ia32_cvtpd2dq((__v2df)__a);
409}
410
411static __inline__ int __DEFAULT_FN_ATTRS
412_mm_cvtsd_si32(__m128d __a)
413{
414  return __builtin_ia32_cvtsd2si((__v2df)__a);
415}
416
417static __inline__ __m128 __DEFAULT_FN_ATTRS
418_mm_cvtsd_ss(__m128 __a, __m128d __b)
419{
420  __a[0] = __b[0];
421  return __a;
422}
423
424static __inline__ __m128d __DEFAULT_FN_ATTRS
425_mm_cvtsi32_sd(__m128d __a, int __b)
426{
427  __a[0] = __b;
428  return __a;
429}
430
431static __inline__ __m128d __DEFAULT_FN_ATTRS
432_mm_cvtss_sd(__m128d __a, __m128 __b)
433{
434  __a[0] = __b[0];
435  return __a;
436}
437
438static __inline__ __m128i __DEFAULT_FN_ATTRS
439_mm_cvttpd_epi32(__m128d __a)
440{
441  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
442}
443
444static __inline__ int __DEFAULT_FN_ATTRS
445_mm_cvttsd_si32(__m128d __a)
446{
447  return __a[0];
448}
449
450static __inline__ __m64 __DEFAULT_FN_ATTRS
451_mm_cvtpd_pi32(__m128d __a)
452{
453  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
454}
455
456static __inline__ __m64 __DEFAULT_FN_ATTRS
457_mm_cvttpd_pi32(__m128d __a)
458{
459  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
460}
461
462static __inline__ __m128d __DEFAULT_FN_ATTRS
463_mm_cvtpi32_pd(__m64 __a)
464{
465  return __builtin_ia32_cvtpi2pd((__v2si)__a);
466}
467
468static __inline__ double __DEFAULT_FN_ATTRS
469_mm_cvtsd_f64(__m128d __a)
470{
471  return __a[0];
472}
473
474static __inline__ __m128d __DEFAULT_FN_ATTRS
475_mm_load_pd(double const *__dp)
476{
477  return *(__m128d*)__dp;
478}
479
480static __inline__ __m128d __DEFAULT_FN_ATTRS
481_mm_load1_pd(double const *__dp)
482{
483  struct __mm_load1_pd_struct {
484    double __u;
485  } __attribute__((__packed__, __may_alias__));
486  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
487  return (__m128d){ __u, __u };
488}
489
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
491
492static __inline__ __m128d __DEFAULT_FN_ATTRS
493_mm_loadr_pd(double const *__dp)
494{
495  __m128d __u = *(__m128d*)__dp;
496  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
497}
498
499static __inline__ __m128d __DEFAULT_FN_ATTRS
500_mm_loadu_pd(double const *__dp)
501{
502  struct __loadu_pd {
503    __m128d __v;
504  } __attribute__((__packed__, __may_alias__));
505  return ((struct __loadu_pd*)__dp)->__v;
506}
507
508static __inline__ __m128i __DEFAULT_FN_ATTRS
509_mm_loadu_si64(void const *__a)
510{
511  struct __loadu_si64 {
512    long long __v;
513  } __attribute__((__packed__, __may_alias__));
514  long long __u = ((struct __loadu_si64*)__a)->__v;
515  return (__m128i){__u, 0L};
516}
517
518static __inline__ __m128d __DEFAULT_FN_ATTRS
519_mm_load_sd(double const *__dp)
520{
521  struct __mm_load_sd_struct {
522    double __u;
523  } __attribute__((__packed__, __may_alias__));
524  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
525  return (__m128d){ __u, 0 };
526}
527
528static __inline__ __m128d __DEFAULT_FN_ATTRS
529_mm_loadh_pd(__m128d __a, double const *__dp)
530{
531  struct __mm_loadh_pd_struct {
532    double __u;
533  } __attribute__((__packed__, __may_alias__));
534  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
535  return (__m128d){ __a[0], __u };
536}
537
538static __inline__ __m128d __DEFAULT_FN_ATTRS
539_mm_loadl_pd(__m128d __a, double const *__dp)
540{
541  struct __mm_loadl_pd_struct {
542    double __u;
543  } __attribute__((__packed__, __may_alias__));
544  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
545  return (__m128d){ __u, __a[1] };
546}
547
548static __inline__ __m128d __DEFAULT_FN_ATTRS
549_mm_undefined_pd(void)
550{
551  return (__m128d)__builtin_ia32_undef128();
552}
553
554static __inline__ __m128d __DEFAULT_FN_ATTRS
555_mm_set_sd(double __w)
556{
557  return (__m128d){ __w, 0 };
558}
559
560static __inline__ __m128d __DEFAULT_FN_ATTRS
561_mm_set1_pd(double __w)
562{
563  return (__m128d){ __w, __w };
564}
565
566static __inline__ __m128d __DEFAULT_FN_ATTRS
567_mm_set_pd(double __w, double __x)
568{
569  return (__m128d){ __x, __w };
570}
571
572static __inline__ __m128d __DEFAULT_FN_ATTRS
573_mm_setr_pd(double __w, double __x)
574{
575  return (__m128d){ __w, __x };
576}
577
578static __inline__ __m128d __DEFAULT_FN_ATTRS
579_mm_setzero_pd(void)
580{
581  return (__m128d){ 0, 0 };
582}
583
584static __inline__ __m128d __DEFAULT_FN_ATTRS
585_mm_move_sd(__m128d __a, __m128d __b)
586{
587  return (__m128d){ __b[0], __a[1] };
588}
589
590static __inline__ void __DEFAULT_FN_ATTRS
591_mm_store_sd(double *__dp, __m128d __a)
592{
593  struct __mm_store_sd_struct {
594    double __u;
595  } __attribute__((__packed__, __may_alias__));
596  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
597}
598
599static __inline__ void __DEFAULT_FN_ATTRS
600_mm_store_pd(double *__dp, __m128d __a)
601{
602  *(__m128d*)__dp = __a;
603}
604
605static __inline__ void __DEFAULT_FN_ATTRS
606_mm_store1_pd(double *__dp, __m128d __a)
607{
608  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
609  _mm_store_pd(__dp, __a);
610}
611
612static __inline__ void __DEFAULT_FN_ATTRS
613_mm_store_pd1(double *__dp, __m128d __a)
614{
615  return _mm_store1_pd(__dp, __a);
616}
617
618static __inline__ void __DEFAULT_FN_ATTRS
619_mm_storeu_pd(double *__dp, __m128d __a)
620{
621  struct __storeu_pd {
622    __m128d __v;
623  } __attribute__((__packed__, __may_alias__));
624  ((struct __storeu_pd*)__dp)->__v = __a;
625}
626
627static __inline__ void __DEFAULT_FN_ATTRS
628_mm_storer_pd(double *__dp, __m128d __a)
629{
630  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
631  *(__m128d *)__dp = __a;
632}
633
634static __inline__ void __DEFAULT_FN_ATTRS
635_mm_storeh_pd(double *__dp, __m128d __a)
636{
637  struct __mm_storeh_pd_struct {
638    double __u;
639  } __attribute__((__packed__, __may_alias__));
640  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
641}
642
643static __inline__ void __DEFAULT_FN_ATTRS
644_mm_storel_pd(double *__dp, __m128d __a)
645{
646  struct __mm_storeh_pd_struct {
647    double __u;
648  } __attribute__((__packed__, __may_alias__));
649  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
650}
651
652static __inline__ __m128i __DEFAULT_FN_ATTRS
653_mm_add_epi8(__m128i __a, __m128i __b)
654{
655  return (__m128i)((__v16qu)__a + (__v16qu)__b);
656}
657
658static __inline__ __m128i __DEFAULT_FN_ATTRS
659_mm_add_epi16(__m128i __a, __m128i __b)
660{
661  return (__m128i)((__v8hu)__a + (__v8hu)__b);
662}
663
664static __inline__ __m128i __DEFAULT_FN_ATTRS
665_mm_add_epi32(__m128i __a, __m128i __b)
666{
667  return (__m128i)((__v4su)__a + (__v4su)__b);
668}
669
670static __inline__ __m64 __DEFAULT_FN_ATTRS
671_mm_add_si64(__m64 __a, __m64 __b)
672{
673  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
674}
675
676static __inline__ __m128i __DEFAULT_FN_ATTRS
677_mm_add_epi64(__m128i __a, __m128i __b)
678{
679  return (__m128i)((__v2du)__a + (__v2du)__b);
680}
681
682static __inline__ __m128i __DEFAULT_FN_ATTRS
683_mm_adds_epi8(__m128i __a, __m128i __b)
684{
685  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
686}
687
688static __inline__ __m128i __DEFAULT_FN_ATTRS
689_mm_adds_epi16(__m128i __a, __m128i __b)
690{
691  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
692}
693
694static __inline__ __m128i __DEFAULT_FN_ATTRS
695_mm_adds_epu8(__m128i __a, __m128i __b)
696{
697  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
698}
699
700static __inline__ __m128i __DEFAULT_FN_ATTRS
701_mm_adds_epu16(__m128i __a, __m128i __b)
702{
703  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
704}
705
706static __inline__ __m128i __DEFAULT_FN_ATTRS
707_mm_avg_epu8(__m128i __a, __m128i __b)
708{
709  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
710}
711
712static __inline__ __m128i __DEFAULT_FN_ATTRS
713_mm_avg_epu16(__m128i __a, __m128i __b)
714{
715  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
716}
717
718static __inline__ __m128i __DEFAULT_FN_ATTRS
719_mm_madd_epi16(__m128i __a, __m128i __b)
720{
721  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
722}
723
724static __inline__ __m128i __DEFAULT_FN_ATTRS
725_mm_max_epi16(__m128i __a, __m128i __b)
726{
727  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
728}
729
730static __inline__ __m128i __DEFAULT_FN_ATTRS
731_mm_max_epu8(__m128i __a, __m128i __b)
732{
733  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
734}
735
736static __inline__ __m128i __DEFAULT_FN_ATTRS
737_mm_min_epi16(__m128i __a, __m128i __b)
738{
739  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
740}
741
742static __inline__ __m128i __DEFAULT_FN_ATTRS
743_mm_min_epu8(__m128i __a, __m128i __b)
744{
745  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
746}
747
748static __inline__ __m128i __DEFAULT_FN_ATTRS
749_mm_mulhi_epi16(__m128i __a, __m128i __b)
750{
751  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
752}
753
754static __inline__ __m128i __DEFAULT_FN_ATTRS
755_mm_mulhi_epu16(__m128i __a, __m128i __b)
756{
757  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
758}
759
760/// \brief Multiplies the corresponding elements of two [8 x short] vectors and
761///    returns a vector containing the low-order 16 bits of each 32-bit product
762///    in the corresponding element.
763///
764/// \headerfile <x86intrin.h>
765///
766/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
767///
768/// \param __a
769///    A 128-bit integer vector containing one of the source operands.
770/// \param __b
771///    A 128-bit integer vector containing one of the source operands.
772/// \returns A 128-bit integer vector containing the products of both operands.
773static __inline__ __m128i __DEFAULT_FN_ATTRS
774_mm_mullo_epi16(__m128i __a, __m128i __b)
775{
776  return (__m128i)((__v8hu)__a * (__v8hu)__b);
777}
778
779/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
780///    of the two 64-bit integer vectors and returns the 64-bit unsigned
781///    product.
782///
783/// \headerfile <x86intrin.h>
784///
785/// This intrinsic corresponds to the \c PMULUDQ instruction.
786///
787/// \param __a
788///    A 64-bit integer containing one of the source operands.
789/// \param __b
790///    A 64-bit integer containing one of the source operands.
791/// \returns A 64-bit integer vector containing the product of both operands.
792static __inline__ __m64 __DEFAULT_FN_ATTRS
793_mm_mul_su32(__m64 __a, __m64 __b)
794{
795  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
796}
797
798/// \brief Multiplies 32-bit unsigned integer values contained in the lower
799///    bits of the corresponding elements of two [2 x i64] vectors, and returns
800///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
801///
802/// \headerfile <x86intrin.h>
803///
804/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
805///
806/// \param __a
807///    A [2 x i64] vector containing one of the source operands.
808/// \param __b
809///    A [2 x i64] vector containing one of the source operands.
810/// \returns A [2 x i64] vector containing the product of both operands.
811static __inline__ __m128i __DEFAULT_FN_ATTRS
812_mm_mul_epu32(__m128i __a, __m128i __b)
813{
814  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
815}
816
817/// \brief Computes the absolute differences of corresponding 8-bit integer
818///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
819///    separately sums the second 8 absolute differences. Packss these two
820///    unsigned 16-bit integer sums into the upper and lower elements of a
821///    [2 x i64] vector.
822///
823/// \headerfile <x86intrin.h>
824///
825/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
826///
827/// \param __a
828///    A 128-bit integer vector containing one of the source operands.
829/// \param __b
830///    A 128-bit integer vector containing one of the source operands.
831/// \returns A [2 x i64] vector containing the sums of the sets of absolute
832///    differences between both operands.
833static __inline__ __m128i __DEFAULT_FN_ATTRS
834_mm_sad_epu8(__m128i __a, __m128i __b)
835{
836  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
837}
838
839/// \brief Subtracts the corresponding 8-bit integer values in the operands.
840///
841/// \headerfile <x86intrin.h>
842///
843/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
844///
845/// \param __a
846///    A 128-bit integer vector containing the minuends.
847/// \param __b
848///    A 128-bit integer vector containing the subtrahends.
849/// \returns A 128-bit integer vector containing the differences of the values
850///    in the operands.
851static __inline__ __m128i __DEFAULT_FN_ATTRS
852_mm_sub_epi8(__m128i __a, __m128i __b)
853{
854  return (__m128i)((__v16qu)__a - (__v16qu)__b);
855}
856
857/// \brief Subtracts the corresponding 16-bit integer values in the operands.
858///
859/// \headerfile <x86intrin.h>
860///
861/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
862///
863/// \param __a
864///    A 128-bit integer vector containing the minuends.
865/// \param __b
866///    A 128-bit integer vector containing the subtrahends.
867/// \returns A 128-bit integer vector containing the differences of the values
868///    in the operands.
869static __inline__ __m128i __DEFAULT_FN_ATTRS
870_mm_sub_epi16(__m128i __a, __m128i __b)
871{
872  return (__m128i)((__v8hu)__a - (__v8hu)__b);
873}
874
875/// \brief Subtracts the corresponding 32-bit integer values in the operands.
876///
877/// \headerfile <x86intrin.h>
878///
879/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
880///
881/// \param __a
882///    A 128-bit integer vector containing the minuends.
883/// \param __b
884///    A 128-bit integer vector containing the subtrahends.
885/// \returns A 128-bit integer vector containing the differences of the values
886///    in the operands.
887static __inline__ __m128i __DEFAULT_FN_ATTRS
888_mm_sub_epi32(__m128i __a, __m128i __b)
889{
890  return (__m128i)((__v4su)__a - (__v4su)__b);
891}
892
893/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
894///    difference to the corresponding bits in the destination.
895///
896/// \headerfile <x86intrin.h>
897///
898/// This intrinsic corresponds to the \c PSUBQ instruction.
899///
900/// \param __a
901///    A 64-bit integer vector containing the minuend.
902/// \param __b
903///    A 64-bit integer vector containing the subtrahend.
904/// \returns A 64-bit integer vector containing the difference of the values in
905///    the operands.
906static __inline__ __m64 __DEFAULT_FN_ATTRS
907_mm_sub_si64(__m64 __a, __m64 __b)
908{
909  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
910}
911
912/// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
913///
914/// \headerfile <x86intrin.h>
915///
916/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
917///
918/// \param __a
919///    A 128-bit integer vector containing the minuends.
920/// \param __b
921///    A 128-bit integer vector containing the subtrahends.
922/// \returns A 128-bit integer vector containing the differences of the values
923///    in the operands.
924static __inline__ __m128i __DEFAULT_FN_ATTRS
925_mm_sub_epi64(__m128i __a, __m128i __b)
926{
927  return (__m128i)((__v2du)__a - (__v2du)__b);
928}
929
930/// \brief Subtracts corresponding 8-bit signed integer values in the input and
931///    returns the differences in the corresponding bytes in the destination.
932///    Differences greater than 7Fh are saturated to 7Fh, and differences less
933///    than 80h are saturated to 80h.
934///
935/// \headerfile <x86intrin.h>
936///
937/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
938///
939/// \param __a
940///    A 128-bit integer vector containing the minuends.
941/// \param __b
942///    A 128-bit integer vector containing the subtrahends.
943/// \returns A 128-bit integer vector containing the differences of the values
944///    in the operands.
945static __inline__ __m128i __DEFAULT_FN_ATTRS
946_mm_subs_epi8(__m128i __a, __m128i __b)
947{
948  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
949}
950
951/// \brief Subtracts corresponding 16-bit signed integer values in the input and
952///    returns the differences in the corresponding bytes in the destination.
953///    Differences greater than 7FFFh are saturated to 7FFFh, and values less
954///    than 8000h are saturated to 8000h.
955///
956/// \headerfile <x86intrin.h>
957///
958/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
959///
960/// \param __a
961///    A 128-bit integer vector containing the minuends.
962/// \param __b
963///    A 128-bit integer vector containing the subtrahends.
964/// \returns A 128-bit integer vector containing the differences of the values
965///    in the operands.
966static __inline__ __m128i __DEFAULT_FN_ATTRS
967_mm_subs_epi16(__m128i __a, __m128i __b)
968{
969  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
970}
971
972/// \brief Subtracts corresponding 8-bit unsigned integer values in the input
973///    and returns the differences in the corresponding bytes in the
974///    destination. Differences less than 00h are saturated to 00h.
975///
976/// \headerfile <x86intrin.h>
977///
978/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
979///
980/// \param __a
981///    A 128-bit integer vector containing the minuends.
982/// \param __b
983///    A 128-bit integer vector containing the subtrahends.
984/// \returns A 128-bit integer vector containing the unsigned integer
985///    differences of the values in the operands.
986static __inline__ __m128i __DEFAULT_FN_ATTRS
987_mm_subs_epu8(__m128i __a, __m128i __b)
988{
989  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
990}
991
992/// \brief Subtracts corresponding 16-bit unsigned integer values in the input
993///    and returns the differences in the corresponding bytes in the
994///    destination. Differences less than 0000h are saturated to 0000h.
995///
996/// \headerfile <x86intrin.h>
997///
998/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
999///
1000/// \param __a
1001///    A 128-bit integer vector containing the minuends.
1002/// \param __b
1003///    A 128-bit integer vector containing the subtrahends.
1004/// \returns A 128-bit integer vector containing the unsigned integer
1005///    differences of the values in the operands.
1006static __inline__ __m128i __DEFAULT_FN_ATTRS
1007_mm_subs_epu16(__m128i __a, __m128i __b)
1008{
1009  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
1010}
1011
1012/// \brief Performs a bitwise AND of two 128-bit integer vectors.
1013///
1014/// \headerfile <x86intrin.h>
1015///
1016/// This intrinsic corresponds to the \c VPAND / PAND instruction.
1017///
1018/// \param __a
1019///    A 128-bit integer vector containing one of the source operands.
1020/// \param __b
1021///    A 128-bit integer vector containing one of the source operands.
1022/// \returns A 128-bit integer vector containing the bitwise AND of the values
1023///    in both operands.
1024static __inline__ __m128i __DEFAULT_FN_ATTRS
1025_mm_and_si128(__m128i __a, __m128i __b)
1026{
1027  return (__m128i)((__v2du)__a & (__v2du)__b);
1028}
1029
1030/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
1031///    one's complement of the values contained in the first source operand.
1032///
1033/// \headerfile <x86intrin.h>
1034///
1035/// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
1036///
1037/// \param __a
1038///    A 128-bit vector containing the left source operand. The one's complement
1039///    of this value is used in the bitwise AND.
1040/// \param __b
1041///    A 128-bit vector containing the right source operand.
1042/// \returns A 128-bit integer vector containing the bitwise AND of the one's
1043///    complement of the first operand and the values in the second operand.
1044static __inline__ __m128i __DEFAULT_FN_ATTRS
1045_mm_andnot_si128(__m128i __a, __m128i __b)
1046{
1047  return (__m128i)(~(__v2du)__a & (__v2du)__b);
1048}
1049/// \brief Performs a bitwise OR of two 128-bit integer vectors.
1050///
1051/// \headerfile <x86intrin.h>
1052///
1053/// This intrinsic corresponds to the \c VPOR / POR instruction.
1054///
1055/// \param __a
1056///    A 128-bit integer vector containing one of the source operands.
1057/// \param __b
1058///    A 128-bit integer vector containing one of the source operands.
1059/// \returns A 128-bit integer vector containing the bitwise OR of the values
1060///    in both operands.
1061static __inline__ __m128i __DEFAULT_FN_ATTRS
1062_mm_or_si128(__m128i __a, __m128i __b)
1063{
1064  return (__m128i)((__v2du)__a | (__v2du)__b);
1065}
1066
1067/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
1068///
1069/// \headerfile <x86intrin.h>
1070///
1071/// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
1072///
1073/// \param __a
1074///    A 128-bit integer vector containing one of the source operands.
1075/// \param __b
1076///    A 128-bit integer vector containing one of the source operands.
1077/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
1078///    values in both operands.
1079static __inline__ __m128i __DEFAULT_FN_ATTRS
1080_mm_xor_si128(__m128i __a, __m128i __b)
1081{
1082  return (__m128i)((__v2du)__a ^ (__v2du)__b);
1083}
1084
/// \brief Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bytes are cleared.
///
///    The shift is expressed as a byte shuffle of a zero vector (first shuffle
///    operand) and the source (second shuffle operand): result byte i is taken
///    from shuffle index 16 + i - imm, so indices below 16 select zero bytes.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift
///    operand a. The value is converted to \c char; if any of bits [7:4] of
///    the converted value are set (for example, when imm is greater than 15),
///    every result byte is selected from the zero vector. The argument is
///    expanded several times by this macro.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) __extension__ ({                              \
  (__m128i)__builtin_shufflevector(                                          \
                                 (__v16qi)_mm_setzero_si128(),               \
                                 (__v16qi)(__m128i)(a),                      \
                                 ((char)(imm)&0xF0) ?  0 : 16 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  1 : 17 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  2 : 18 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  3 : 19 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  4 : 20 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  5 : 21 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  6 : 22 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  7 : 23 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  8 : 24 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  9 : 25 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
1122
/* Alternate name for the byte-wise left shift; expands to _mm_slli_si128
   with the same two arguments. */
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
1125
1126/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
1127///    by the specified number of bits. Low-order bits are cleared.
1128///
1129/// \headerfile <x86intrin.h>
1130///
1131/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
1132///
1133/// \param __a
1134///    A 128-bit integer vector containing the source operand.
1135/// \param __count
1136///    An integer value specifying the number of bits to left-shift each value
1137///    in operand __a.
1138/// \returns A 128-bit integer vector containing the left-shifted values.
1139static __inline__ __m128i __DEFAULT_FN_ATTRS
1140_mm_slli_epi16(__m128i __a, int __count)
1141{
1142  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
1143}
1144
1145/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
1146///    by the specified number of bits. Low-order bits are cleared.
1147///
1148/// \headerfile <x86intrin.h>
1149///
1150/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
1151///
1152/// \param __a
1153///    A 128-bit integer vector containing the source operand.
1154/// \param __count
1155///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1156///    to left-shift each value in operand __a.
1157/// \returns A 128-bit integer vector containing the left-shifted values.
1158static __inline__ __m128i __DEFAULT_FN_ATTRS
1159_mm_sll_epi16(__m128i __a, __m128i __count)
1160{
1161  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
1162}
1163
1164/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
1165///    by the specified number of bits. Low-order bits are cleared.
1166///
1167/// \headerfile <x86intrin.h>
1168///
1169/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
1170///
1171/// \param __a
1172///    A 128-bit integer vector containing the source operand.
1173/// \param __count
1174///    An integer value specifying the number of bits to left-shift each value
1175///    in operand __a.
1176/// \returns A 128-bit integer vector containing the left-shifted values.
1177static __inline__ __m128i __DEFAULT_FN_ATTRS
1178_mm_slli_epi32(__m128i __a, int __count)
1179{
1180  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
1181}
1182
1183/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
1184///    by the specified number of bits. Low-order bits are cleared.
1185///
1186/// \headerfile <x86intrin.h>
1187///
1188/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
1189///
1190/// \param __a
1191///    A 128-bit integer vector containing the source operand.
1192/// \param __count
1193///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1194///    to left-shift each value in operand __a.
1195/// \returns A 128-bit integer vector containing the left-shifted values.
1196static __inline__ __m128i __DEFAULT_FN_ATTRS
1197_mm_sll_epi32(__m128i __a, __m128i __count)
1198{
1199  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
1200}
1201
1202/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
1203///    by the specified number of bits. Low-order bits are cleared.
1204///
1205/// \headerfile <x86intrin.h>
1206///
1207/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
1208///
1209/// \param __a
1210///    A 128-bit integer vector containing the source operand.
1211/// \param __count
1212///    An integer value specifying the number of bits to left-shift each value
1213///    in operand __a.
1214/// \returns A 128-bit integer vector containing the left-shifted values.
1215static __inline__ __m128i __DEFAULT_FN_ATTRS
1216_mm_slli_epi64(__m128i __a, int __count)
1217{
1218  return __builtin_ia32_psllqi128((__v2di)__a, __count);
1219}
1220
1221/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
1222///    by the specified number of bits. Low-order bits are cleared.
1223///
1224/// \headerfile <x86intrin.h>
1225///
1226/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
1227///
1228/// \param __a
1229///    A 128-bit integer vector containing the source operand.
1230/// \param __count
1231///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1232///    to left-shift each value in operand __a.
1233/// \returns A 128-bit integer vector containing the left-shifted values.
1234static __inline__ __m128i __DEFAULT_FN_ATTRS
1235_mm_sll_epi64(__m128i __a, __m128i __count)
1236{
1237  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
1238}
1239
1240/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
1241///    by the specified number of bits. High-order bits are filled with the sign
1242///    bit of the initial value.
1243///
1244/// \headerfile <x86intrin.h>
1245///
1246/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
1247///
1248/// \param __a
1249///    A 128-bit integer vector containing the source operand.
1250/// \param __count
1251///    An integer value specifying the number of bits to right-shift each value
1252///    in operand __a.
1253/// \returns A 128-bit integer vector containing the right-shifted values.
1254static __inline__ __m128i __DEFAULT_FN_ATTRS
1255_mm_srai_epi16(__m128i __a, int __count)
1256{
1257  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
1258}
1259
1260/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
1261///    by the specified number of bits. High-order bits are filled with the sign
1262///    bit of the initial value.
1263///
1264/// \headerfile <x86intrin.h>
1265///
1266/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
1267///
1268/// \param __a
1269///    A 128-bit integer vector containing the source operand.
1270/// \param __count
1271///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1272///    to right-shift each value in operand __a.
1273/// \returns A 128-bit integer vector containing the right-shifted values.
1274static __inline__ __m128i __DEFAULT_FN_ATTRS
1275_mm_sra_epi16(__m128i __a, __m128i __count)
1276{
1277  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
1278}
1279
1280/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
1281///    by the specified number of bits. High-order bits are filled with the sign
1282///    bit of the initial value.
1283///
1284/// \headerfile <x86intrin.h>
1285///
1286/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
1287///
1288/// \param __a
1289///    A 128-bit integer vector containing the source operand.
1290/// \param __count
1291///    An integer value specifying the number of bits to right-shift each value
1292///    in operand __a.
1293/// \returns A 128-bit integer vector containing the right-shifted values.
1294static __inline__ __m128i __DEFAULT_FN_ATTRS
1295_mm_srai_epi32(__m128i __a, int __count)
1296{
1297  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
1298}
1299
1300/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
1301///    by the specified number of bits. High-order bits are filled with the sign
1302///    bit of the initial value.
1303///
1304/// \headerfile <x86intrin.h>
1305///
1306/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
1307///
1308/// \param __a
1309///    A 128-bit integer vector containing the source operand.
1310/// \param __count
1311///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1312///    to right-shift each value in operand __a.
1313/// \returns A 128-bit integer vector containing the right-shifted values.
1314static __inline__ __m128i __DEFAULT_FN_ATTRS
1315_mm_sra_epi32(__m128i __a, __m128i __count)
1316{
1317  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
1318}
1319
/// \brief Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bytes are cleared.
///
///    The shift is expressed as a byte shuffle of the source (first shuffle
///    operand) and a zero vector (second shuffle operand): result byte i is
///    taken from shuffle index imm + i, so indices of 16 or more select zero
///    bytes.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    a. The value is converted to \c char; if any of bits [7:4] of the
///    converted value are set (for example, when imm is greater than 15),
///    every result byte is selected from the zero vector. The argument is
///    expanded several times by this macro.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) __extension__ ({                              \
  (__m128i)__builtin_shufflevector(                                          \
                                 (__v16qi)(__m128i)(a),                      \
                                 (__v16qi)_mm_setzero_si128(),               \
                                 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0,  \
                                 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1,  \
                                 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2,  \
                                 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3,  \
                                 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4,  \
                                 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5,  \
                                 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6,  \
                                 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7,  \
                                 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8,  \
                                 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9,  \
                                 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
                                 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
                                 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
                                 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
                                 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
                                 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
1357
/* Alternate name for the byte-wise right shift; expands to _mm_srli_si128
   with the same two arguments. */
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
1360
/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
1363///
1364/// \headerfile <x86intrin.h>
1365///
1366/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
1367///
1368/// \param __a
1369///    A 128-bit integer vector containing the source operand.
1370/// \param __count
1371///    An integer value specifying the number of bits to right-shift each value
1372///    in operand __a.
1373/// \returns A 128-bit integer vector containing the right-shifted values.
1374static __inline__ __m128i __DEFAULT_FN_ATTRS
1375_mm_srli_epi16(__m128i __a, int __count)
1376{
1377  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
1378}
1379
/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
1382///
1383/// \headerfile <x86intrin.h>
1384///
1385/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
1386///
1387/// \param __a
1388///    A 128-bit integer vector containing the source operand.
1389/// \param __count
1390///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1391///    to right-shift each value in operand __a.
1392/// \returns A 128-bit integer vector containing the right-shifted values.
1393static __inline__ __m128i __DEFAULT_FN_ATTRS
1394_mm_srl_epi16(__m128i __a, __m128i __count)
1395{
1396  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
1397}
1398
/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
1401///
1402/// \headerfile <x86intrin.h>
1403///
1404/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
1405///
1406/// \param __a
1407///    A 128-bit integer vector containing the source operand.
1408/// \param __count
1409///    An integer value specifying the number of bits to right-shift each value
1410///    in operand __a.
1411/// \returns A 128-bit integer vector containing the right-shifted values.
1412static __inline__ __m128i __DEFAULT_FN_ATTRS
1413_mm_srli_epi32(__m128i __a, int __count)
1414{
1415  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
1416}
1417
/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
1420///
1421/// \headerfile <x86intrin.h>
1422///
1423/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
1424///
1425/// \param __a
1426///    A 128-bit integer vector containing the source operand.
1427/// \param __count
1428///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1429///    to right-shift each value in operand __a.
1430/// \returns A 128-bit integer vector containing the right-shifted values.
1431static __inline__ __m128i __DEFAULT_FN_ATTRS
1432_mm_srl_epi32(__m128i __a, __m128i __count)
1433{
1434  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
1435}
1436
/// \brief Right-shifts each 64-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
1439///
1440/// \headerfile <x86intrin.h>
1441///
1442/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
1443///
1444/// \param __a
1445///    A 128-bit integer vector containing the source operand.
1446/// \param __count
1447///    An integer value specifying the number of bits to right-shift each value
1448///    in operand __a.
1449/// \returns A 128-bit integer vector containing the right-shifted values.
1450static __inline__ __m128i __DEFAULT_FN_ATTRS
1451_mm_srli_epi64(__m128i __a, int __count)
1452{
1453  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
1454}
1455
/// \brief Right-shifts each 64-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
1458///
1459/// \headerfile <x86intrin.h>
1460///
1461/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
1462///
1463/// \param __a
1464///    A 128-bit integer vector containing the source operand.
1465/// \param __count
1466///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1467///    to right-shift each value in operand __a.
1468/// \returns A 128-bit integer vector containing the right-shifted values.
1469static __inline__ __m128i __DEFAULT_FN_ATTRS
1470_mm_srl_epi64(__m128i __a, __m128i __count)
1471{
1472  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
1473}
1474
1475/// \brief Compares each of the corresponding 8-bit values of the 128-bit
1476///    integer vectors for equality. Each comparison yields 0h for false, FFh
1477///    for true.
1478///
1479/// \headerfile <x86intrin.h>
1480///
1481/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
1482///
1483/// \param __a
1484///    A 128-bit integer vector.
1485/// \param __b
1486///    A 128-bit integer vector.
1487/// \returns A 128-bit integer vector containing the comparison results.
1488static __inline__ __m128i __DEFAULT_FN_ATTRS
1489_mm_cmpeq_epi8(__m128i __a, __m128i __b)
1490{
1491  return (__m128i)((__v16qi)__a == (__v16qi)__b);
1492}
1493
1494/// \brief Compares each of the corresponding 16-bit values of the 128-bit
1495///    integer vectors for equality. Each comparison yields 0h for false, FFFFh
1496///    for true.
1497///
1498/// \headerfile <x86intrin.h>
1499///
1500/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
1501///
1502/// \param __a
1503///    A 128-bit integer vector.
1504/// \param __b
1505///    A 128-bit integer vector.
1506/// \returns A 128-bit integer vector containing the comparison results.
1507static __inline__ __m128i __DEFAULT_FN_ATTRS
1508_mm_cmpeq_epi16(__m128i __a, __m128i __b)
1509{
1510  return (__m128i)((__v8hi)__a == (__v8hi)__b);
1511}
1512
1513/// \brief Compares each of the corresponding 32-bit values of the 128-bit
1514///    integer vectors for equality. Each comparison yields 0h for false,
1515///    FFFFFFFFh for true.
1516///
1517/// \headerfile <x86intrin.h>
1518///
1519/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
1520///
1521/// \param __a
1522///    A 128-bit integer vector.
1523/// \param __b
1524///    A 128-bit integer vector.
1525/// \returns A 128-bit integer vector containing the comparison results.
1526static __inline__ __m128i __DEFAULT_FN_ATTRS
1527_mm_cmpeq_epi32(__m128i __a, __m128i __b)
1528{
1529  return (__m128i)((__v4si)__a == (__v4si)__b);
1530}
1531
1532/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
1533///    integer vectors to determine if the values in the first operand are
1534///    greater than those in the second operand. Each comparison yields 0h for
1535///    false, FFh for true.
1536///
1537/// \headerfile <x86intrin.h>
1538///
1539/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
1540///
1541/// \param __a
1542///    A 128-bit integer vector.
1543/// \param __b
1544///    A 128-bit integer vector.
1545/// \returns A 128-bit integer vector containing the comparison results.
1546static __inline__ __m128i __DEFAULT_FN_ATTRS
1547_mm_cmpgt_epi8(__m128i __a, __m128i __b)
1548{
1549  /* This function always performs a signed comparison, but __v16qi is a char
1550     which may be signed or unsigned, so use __v16qs. */
1551  return (__m128i)((__v16qs)__a > (__v16qs)__b);
1552}
1553
1554/// \brief Compares each of the corresponding signed 16-bit values of the
1555///    128-bit integer vectors to determine if the values in the first operand
1556///    are greater than those in the second operand. Each comparison yields 0h
1557///    for false, FFFFh for true.
1558///
1559/// \headerfile <x86intrin.h>
1560///
1561/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
1562///
1563/// \param __a
1564///    A 128-bit integer vector.
1565/// \param __b
1566///    A 128-bit integer vector.
1567/// \returns A 128-bit integer vector containing the comparison results.
1568static __inline__ __m128i __DEFAULT_FN_ATTRS
1569_mm_cmpgt_epi16(__m128i __a, __m128i __b)
1570{
1571  return (__m128i)((__v8hi)__a > (__v8hi)__b);
1572}
1573
1574/// \brief Compares each of the corresponding signed 32-bit values of the
1575///    128-bit integer vectors to determine if the values in the first operand
1576///    are greater than those in the second operand. Each comparison yields 0h
1577///    for false, FFFFFFFFh for true.
1578///
1579/// \headerfile <x86intrin.h>
1580///
1581/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
1582///
1583/// \param __a
1584///    A 128-bit integer vector.
1585/// \param __b
1586///    A 128-bit integer vector.
1587/// \returns A 128-bit integer vector containing the comparison results.
1588static __inline__ __m128i __DEFAULT_FN_ATTRS
1589_mm_cmpgt_epi32(__m128i __a, __m128i __b)
1590{
1591  return (__m128i)((__v4si)__a > (__v4si)__b);
1592}
1593
1594/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
1595///    integer vectors to determine if the values in the first operand are less
1596///    than those in the second operand. Each comparison yields 0h for false,
1597///    FFh for true.
1598///
1599/// \headerfile <x86intrin.h>
1600///
1601/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
1602///
1603/// \param __a
1604///    A 128-bit integer vector.
1605/// \param __b
1606///    A 128-bit integer vector.
1607/// \returns A 128-bit integer vector containing the comparison results.
1608static __inline__ __m128i __DEFAULT_FN_ATTRS
1609_mm_cmplt_epi8(__m128i __a, __m128i __b)
1610{
1611  return _mm_cmpgt_epi8(__b, __a);
1612}
1613
1614/// \brief Compares each of the corresponding signed 16-bit values of the
1615///    128-bit integer vectors to determine if the values in the first operand
1616///    are less than those in the second operand. Each comparison yields 0h for
1617///    false, FFFFh for true.
1618///
1619/// \headerfile <x86intrin.h>
1620///
1621/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
1622///
1623/// \param __a
1624///    A 128-bit integer vector.
1625/// \param __b
1626///    A 128-bit integer vector.
1627/// \returns A 128-bit integer vector containing the comparison results.
1628static __inline__ __m128i __DEFAULT_FN_ATTRS
1629_mm_cmplt_epi16(__m128i __a, __m128i __b)
1630{
1631  return _mm_cmpgt_epi16(__b, __a);
1632}
1633
1634/// \brief Compares each of the corresponding signed 32-bit values of the
1635///    128-bit integer vectors to determine if the values in the first operand
1636///    are less than those in the second operand. Each comparison yields 0h for
1637///    false, FFFFFFFFh for true.
1638///
1639/// \headerfile <x86intrin.h>
1640///
1641/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
1642///
1643/// \param __a
1644///    A 128-bit integer vector.
1645/// \param __b
1646///    A 128-bit integer vector.
1647/// \returns A 128-bit integer vector containing the comparison results.
1648static __inline__ __m128i __DEFAULT_FN_ATTRS
1649_mm_cmplt_epi32(__m128i __a, __m128i __b)
1650{
1651  return _mm_cmpgt_epi32(__b, __a);
1652}
1653
1654#ifdef __x86_64__
1655/// \brief Converts a 64-bit signed integer value from the second operand into a
1656///    double-precision value and returns it in the lower element of a [2 x
1657///    double] vector; the upper element of the returned vector is copied from
1658///    the upper element of the first operand.
1659///
1660/// \headerfile <x86intrin.h>
1661///
1662/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
1663///
1664/// \param __a
1665///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
1666///    copied to the upper 64 bits of the destination.
1667/// \param __b
1668///    A 64-bit signed integer operand containing the value to be converted.
1669/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
1670///    converted value of the second operand. The upper 64 bits are copied from
1671///    the upper 64 bits of the first operand.
1672static __inline__ __m128d __DEFAULT_FN_ATTRS
1673_mm_cvtsi64_sd(__m128d __a, long long __b)
1674{
1675  __a[0] = __b;
1676  return __a;
1677}
1678
1679/// \brief Converts the first (lower) element of a vector of [2 x double] into a
1680///    64-bit signed integer value, according to the current rounding mode.
1681///
1682/// \headerfile <x86intrin.h>
1683///
1684/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
1685///
1686/// \param __a
1687///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1688///    conversion.
1689/// \returns A 64-bit signed integer containing the converted value.
1690static __inline__ long long __DEFAULT_FN_ATTRS
1691_mm_cvtsd_si64(__m128d __a)
1692{
1693  return __builtin_ia32_cvtsd2si64((__v2df)__a);
1694}
1695
1696/// \brief Converts the first (lower) element of a vector of [2 x double] into a
1697///    64-bit signed integer value, truncating the result when it is inexact.
1698///
1699/// \headerfile <x86intrin.h>
1700///
1701/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
1702///
1703/// \param __a
1704///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1705///    conversion.
1706/// \returns A 64-bit signed integer containing the converted value.
1707static __inline__ long long __DEFAULT_FN_ATTRS
1708_mm_cvttsd_si64(__m128d __a)
1709{
1710  return __a[0];
1711}
1712#endif
1713
1714/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
1715///
1716/// \headerfile <x86intrin.h>
1717///
1718/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
1719///
1720/// \param __a
1721///    A 128-bit integer vector.
1722/// \returns A 128-bit vector of [4 x float] containing the converted values.
1723static __inline__ __m128 __DEFAULT_FN_ATTRS
1724_mm_cvtepi32_ps(__m128i __a)
1725{
1726  return __builtin_ia32_cvtdq2ps((__v4si)__a);
1727}
1728
1729/// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
1730///
1731/// \headerfile <x86intrin.h>
1732///
1733/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
1734///
1735/// \param __a
1736///    A 128-bit vector of [4 x float].
1737/// \returns A 128-bit integer vector of [4 x i32] containing the converted
1738///    values.
1739static __inline__ __m128i __DEFAULT_FN_ATTRS
1740_mm_cvtps_epi32(__m128 __a)
1741{
1742  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
1743}
1744
1745/// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
1746///    truncating the result when it is inexact.
1747///
1748/// \headerfile <x86intrin.h>
1749///
1750/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
1751///
1752/// \param __a
1753///    A 128-bit vector of [4 x float].
1754/// \returns A 128-bit vector of [4 x i32] containing the converted values.
1755static __inline__ __m128i __DEFAULT_FN_ATTRS
1756_mm_cvttps_epi32(__m128 __a)
1757{
1758  return (__m128i)__builtin_convertvector((__v4sf)__a, __v4si);
1759}
1760
1761/// \brief Returns a vector of [4 x i32] where the lowest element is the input
1762///    operand and the remaining elements are zero.
1763///
1764/// \headerfile <x86intrin.h>
1765///
1766/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
1767///
1768/// \param __a
1769///    A 32-bit signed integer operand.
1770/// \returns A 128-bit vector of [4 x i32].
1771static __inline__ __m128i __DEFAULT_FN_ATTRS
1772_mm_cvtsi32_si128(int __a)
1773{
1774  return (__m128i)(__v4si){ __a, 0, 0, 0 };
1775}
1776
1777#ifdef __x86_64__
1778/// \brief Returns a vector of [2 x i64] where the lower element is the input
1779///    operand and the upper element is zero.
1780///
1781/// \headerfile <x86intrin.h>
1782///
1783/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1784///
1785/// \param __a
1786///    A 64-bit signed integer operand containing the value to be converted.
1787/// \returns A 128-bit vector of [2 x i64] containing the converted value.
1788static __inline__ __m128i __DEFAULT_FN_ATTRS
1789_mm_cvtsi64_si128(long long __a)
1790{
1791  return (__m128i){ __a, 0 };
1792}
1793#endif
1794
1795/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
1796///    32-bit signed integer value.
1797///
1798/// \headerfile <x86intrin.h>
1799///
1800/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
1801///
1802/// \param __a
1803///    A vector of [4 x i32]. The least significant 32 bits are moved to the
1804///    destination.
1805/// \returns A 32-bit signed integer containing the moved value.
1806static __inline__ int __DEFAULT_FN_ATTRS
1807_mm_cvtsi128_si32(__m128i __a)
1808{
1809  __v4si __b = (__v4si)__a;
1810  return __b[0];
1811}
1812
1813#ifdef __x86_64__
1814/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
1815///    64-bit signed integer value.
1816///
1817/// \headerfile <x86intrin.h>
1818///
1819/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1820///
1821/// \param __a
1822///    A vector of [2 x i64]. The least significant 64 bits are moved to the
1823///    destination.
1824/// \returns A 64-bit signed integer containing the moved value.
1825static __inline__ long long __DEFAULT_FN_ATTRS
1826_mm_cvtsi128_si64(__m128i __a)
1827{
1828  return __a[0];
1829}
1830#endif
1831
1832/// \brief Moves packed integer values from an aligned 128-bit memory location
1833///    to elements in a 128-bit integer vector.
1834///
1835/// \headerfile <x86intrin.h>
1836///
1837/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
1838///
1839/// \param __p
1840///    An aligned pointer to a memory location containing integer values.
1841/// \returns A 128-bit integer vector containing the moved values.
1842static __inline__ __m128i __DEFAULT_FN_ATTRS
1843_mm_load_si128(__m128i const *__p)
1844{
1845  return *__p;
1846}
1847
1848/// \brief Moves packed integer values from an unaligned 128-bit memory location
1849///    to elements in a 128-bit integer vector.
1850///
1851/// \headerfile <x86intrin.h>
1852///
1853/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
1854///
1855/// \param __p
1856///    A pointer to a memory location containing integer values.
1857/// \returns A 128-bit integer vector containing the moved values.
1858static __inline__ __m128i __DEFAULT_FN_ATTRS
1859_mm_loadu_si128(__m128i const *__p)
1860{
1861  struct __loadu_si128 {
1862    __m128i __v;
1863  } __attribute__((__packed__, __may_alias__));
1864  return ((struct __loadu_si128*)__p)->__v;
1865}
1866
1867/// \brief Returns a vector of [2 x i64] where the lower element is taken from
1868///    the lower element of the operand, and the upper element is zero.
1869///
1870/// \headerfile <x86intrin.h>
1871///
1872/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1873///
1874/// \param __p
1875///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
1876///    the destination.
1877/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
1878///    moved value. The higher order bits are cleared.
1879static __inline__ __m128i __DEFAULT_FN_ATTRS
1880_mm_loadl_epi64(__m128i const *__p)
1881{
1882  struct __mm_loadl_epi64_struct {
1883    long long __u;
1884  } __attribute__((__packed__, __may_alias__));
1885  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1886}
1887
1888/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
1889///    This could be used as an argument to another intrinsic function where the
1890///    argument is required but the value is not actually used.
1891///
1892/// \headerfile <x86intrin.h>
1893///
1894/// This intrinsic has no corresponding instruction.
1895///
1896/// \returns A 128-bit vector of [4 x i32] with unspecified content.
1897static __inline__ __m128i __DEFAULT_FN_ATTRS
1898_mm_undefined_si128(void)
1899{
1900  return (__m128i)__builtin_ia32_undef128();
1901}
1902
1903/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
1904///    the specified 64-bit integer values.
1905///
1906/// \headerfile <x86intrin.h>
1907///
1908/// This intrinsic is a utility function and does not correspond to a specific
1909///    instruction.
1910///
1911/// \param __q1
1912///    A 64-bit integer value used to initialize the upper 64 bits of the
1913///    destination vector of [2 x i64].
1914/// \param __q0
1915///    A 64-bit integer value used to initialize the lower 64 bits of the
1916///    destination vector of [2 x i64].
1917/// \returns An initialized 128-bit vector of [2 x i64] containing the values
1918///    provided in the operands.
1919static __inline__ __m128i __DEFAULT_FN_ATTRS
1920_mm_set_epi64x(long long __q1, long long __q0)
1921{
1922  return (__m128i){ __q0, __q1 };
1923}
1924
1925/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
1926///    the specified 64-bit integer values.
1927///
1928/// \headerfile <x86intrin.h>
1929///
1930/// This intrinsic is a utility function and does not correspond to a specific
1931///    instruction.
1932///
1933/// \param __q1
1934///    A 64-bit integer value used to initialize the upper 64 bits of the
1935///    destination vector of [2 x i64].
1936/// \param __q0
1937///    A 64-bit integer value used to initialize the lower 64 bits of the
1938///    destination vector of [2 x i64].
1939/// \returns An initialized 128-bit vector of [2 x i64] containing the values
1940///    provided in the operands.
1941static __inline__ __m128i __DEFAULT_FN_ATTRS
1942_mm_set_epi64(__m64 __q1, __m64 __q0)
1943{
1944  return (__m128i){ (long long)__q0, (long long)__q1 };
1945}
1946
1947/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
1948///    the specified 32-bit integer values.
1949///
1950/// \headerfile <x86intrin.h>
1951///
1952/// This intrinsic is a utility function and does not correspond to a specific
1953///    instruction.
1954///
1955/// \param __i3
1956///    A 32-bit integer value used to initialize bits [127:96] of the
1957///    destination vector.
1958/// \param __i2
1959///    A 32-bit integer value used to initialize bits [95:64] of the destination
1960///    vector.
1961/// \param __i1
1962///    A 32-bit integer value used to initialize bits [63:32] of the destination
1963///    vector.
1964/// \param __i0
1965///    A 32-bit integer value used to initialize bits [31:0] of the destination
1966///    vector.
1967/// \returns An initialized 128-bit vector of [4 x i32] containing the values
1968///    provided in the operands.
1969static __inline__ __m128i __DEFAULT_FN_ATTRS
1970_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
1971{
1972  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
1973}
1974
1975/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
1976///    the specified 16-bit integer values.
1977///
1978/// \headerfile <x86intrin.h>
1979///
1980/// This intrinsic is a utility function and does not correspond to a specific
1981///    instruction.
1982///
1983/// \param __w7
1984///    A 16-bit integer value used to initialize bits [127:112] of the
1985///    destination vector.
1986/// \param __w6
1987///    A 16-bit integer value used to initialize bits [111:96] of the
1988///    destination vector.
1989/// \param __w5
1990///    A 16-bit integer value used to initialize bits [95:80] of the destination
1991///    vector.
1992/// \param __w4
1993///    A 16-bit integer value used to initialize bits [79:64] of the destination
1994///    vector.
1995/// \param __w3
1996///    A 16-bit integer value used to initialize bits [63:48] of the destination
1997///    vector.
1998/// \param __w2
1999///    A 16-bit integer value used to initialize bits [47:32] of the destination
2000///    vector.
2001/// \param __w1
2002///    A 16-bit integer value used to initialize bits [31:16] of the destination
2003///    vector.
2004/// \param __w0
2005///    A 16-bit integer value used to initialize bits [15:0] of the destination
2006///    vector.
2007/// \returns An initialized 128-bit vector of [8 x i16] containing the values
2008///    provided in the operands.
2009static __inline__ __m128i __DEFAULT_FN_ATTRS
2010_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
2011{
2012  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
2013}
2014
2015/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
2016///    the specified 8-bit integer values.
2017///
2018/// \headerfile <x86intrin.h>
2019///
2020/// This intrinsic is a utility function and does not correspond to a specific
2021///    instruction.
2022///
2023/// \param __b15
2024///    Initializes bits [127:120] of the destination vector.
2025/// \param __b14
2026///    Initializes bits [119:112] of the destination vector.
2027/// \param __b13
2028///    Initializes bits [111:104] of the destination vector.
2029/// \param __b12
2030///    Initializes bits [103:96] of the destination vector.
2031/// \param __b11
2032///    Initializes bits [95:88] of the destination vector.
2033/// \param __b10
2034///    Initializes bits [87:80] of the destination vector.
2035/// \param __b9
2036///    Initializes bits [79:72] of the destination vector.
2037/// \param __b8
2038///    Initializes bits [71:64] of the destination vector.
2039/// \param __b7
2040///    Initializes bits [63:56] of the destination vector.
2041/// \param __b6
2042///    Initializes bits [55:48] of the destination vector.
2043/// \param __b5
2044///    Initializes bits [47:40] of the destination vector.
2045/// \param __b4
2046///    Initializes bits [39:32] of the destination vector.
2047/// \param __b3
2048///    Initializes bits [31:24] of the destination vector.
2049/// \param __b2
2050///    Initializes bits [23:16] of the destination vector.
2051/// \param __b1
2052///    Initializes bits [15:8] of the destination vector.
2053/// \param __b0
2054///    Initializes bits [7:0] of the destination vector.
2055/// \returns An initialized 128-bit vector of [16 x i8] containing the values
2056///    provided in the operands.
2057static __inline__ __m128i __DEFAULT_FN_ATTRS
2058_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
2059{
2060  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
2061}
2062
2063/// \brief Initializes both values in a 128-bit integer vector with the
2064///    specified 64-bit integer value.
2065///
2066/// \headerfile <x86intrin.h>
2067///
2068/// This intrinsic is a utility function and does not correspond to a specific
2069///    instruction.
2070///
2071/// \param __q
2072///    Integer value used to initialize the elements of the destination integer
2073///    vector.
2074/// \returns An initialized 128-bit integer vector of [2 x i64] with both
2075///    elements containing the value provided in the operand.
2076static __inline__ __m128i __DEFAULT_FN_ATTRS
2077_mm_set1_epi64x(long long __q)
2078{
2079  return (__m128i){ __q, __q };
2080}
2081
2082/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
2083///    specified 64-bit value.
2084///
2085/// \headerfile <x86intrin.h>
2086///
2087/// This intrinsic is a utility function and does not correspond to a specific
2088///    instruction.
2089///
2090/// \param __q
2091///    A 64-bit value used to initialize the elements of the destination integer
2092///    vector.
2093/// \returns An initialized 128-bit vector of [2 x i64] with all elements
2094///    containing the value provided in the operand.
2095static __inline__ __m128i __DEFAULT_FN_ATTRS
2096_mm_set1_epi64(__m64 __q)
2097{
2098  return (__m128i){ (long long)__q, (long long)__q };
2099}
2100
2101/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
2102///    specified 32-bit value.
2103///
2104/// \headerfile <x86intrin.h>
2105///
2106/// This intrinsic is a utility function and does not correspond to a specific
2107///    instruction.
2108///
2109/// \param __i
2110///    A 32-bit value used to initialize the elements of the destination integer
2111///    vector.
2112/// \returns An initialized 128-bit vector of [4 x i32] with all elements
2113///    containing the value provided in the operand.
2114static __inline__ __m128i __DEFAULT_FN_ATTRS
2115_mm_set1_epi32(int __i)
2116{
2117  return (__m128i)(__v4si){ __i, __i, __i, __i };
2118}
2119
2120/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
2121///    specified 16-bit value.
2122///
2123/// \headerfile <x86intrin.h>
2124///
2125/// This intrinsic is a utility function and does not correspond to a specific
2126///    instruction.
2127///
2128/// \param __w
2129///    A 16-bit value used to initialize the elements of the destination integer
2130///    vector.
2131/// \returns An initialized 128-bit vector of [8 x i16] with all elements
2132///    containing the value provided in the operand.
2133static __inline__ __m128i __DEFAULT_FN_ATTRS
2134_mm_set1_epi16(short __w)
2135{
2136  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
2137}
2138
2139/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
2140///    specified 8-bit value.
2141///
2142/// \headerfile <x86intrin.h>
2143///
2144/// This intrinsic is a utility function and does not correspond to a specific
2145///    instruction.
2146///
2147/// \param __b
2148///    An 8-bit value used to initialize the elements of the destination integer
2149///    vector.
2150/// \returns An initialized 128-bit vector of [16 x i8] with all elements
2151///    containing the value provided in the operand.
2152static __inline__ __m128i __DEFAULT_FN_ATTRS
2153_mm_set1_epi8(char __b)
2154{
2155  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
2156}
2157
2158static __inline__ __m128i __DEFAULT_FN_ATTRS
2159_mm_setr_epi64(__m64 __q0, __m64 __q1)
2160{
2161  return (__m128i){ (long long)__q0, (long long)__q1 };
2162}
2163
2164static __inline__ __m128i __DEFAULT_FN_ATTRS
2165_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
2166{
2167  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
2168}
2169
2170static __inline__ __m128i __DEFAULT_FN_ATTRS
2171_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
2172{
2173  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
2174}
2175
2176static __inline__ __m128i __DEFAULT_FN_ATTRS
2177_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
2178{
2179  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
2180}
2181
2182static __inline__ __m128i __DEFAULT_FN_ATTRS
2183_mm_setzero_si128(void)
2184{
2185  return (__m128i){ 0LL, 0LL };
2186}
2187
2188static __inline__ void __DEFAULT_FN_ATTRS
2189_mm_store_si128(__m128i *__p, __m128i __b)
2190{
2191  *__p = __b;
2192}
2193
2194static __inline__ void __DEFAULT_FN_ATTRS
2195_mm_storeu_si128(__m128i *__p, __m128i __b)
2196{
2197  struct __storeu_si128 {
2198    __m128i __v;
2199  } __attribute__((__packed__, __may_alias__));
2200  ((struct __storeu_si128*)__p)->__v = __b;
2201}
2202
2203static __inline__ void __DEFAULT_FN_ATTRS
2204_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
2205{
2206  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
2207}
2208
2209static __inline__ void __DEFAULT_FN_ATTRS
2210_mm_storel_epi64(__m128i *__p, __m128i __a)
2211{
2212  struct __mm_storel_epi64_struct {
2213    long long __u;
2214  } __attribute__((__packed__, __may_alias__));
2215  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
2216}
2217
2218static __inline__ void __DEFAULT_FN_ATTRS
2219_mm_stream_pd(double *__p, __m128d __a)
2220{
2221  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
2222}
2223
2224static __inline__ void __DEFAULT_FN_ATTRS
2225_mm_stream_si128(__m128i *__p, __m128i __a)
2226{
2227  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
2228}
2229
/// \brief Stores a 32-bit integer to memory by forwarding to
///    \c __builtin_ia32_movnti.
///
/// \param __p
///    Pointer to the 32-bit destination.
/// \param __a
///    The 32-bit integer value to store.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si32(int *__p, int __a)
{
  __builtin_ia32_movnti(__p, __a);
}
2235
2236#ifdef __x86_64__
/// \brief Stores a 64-bit integer to memory by forwarding to
///    \c __builtin_ia32_movnti64. Only available when \c __x86_64__ is
///    defined.
///
/// \param __p
///    Pointer to the 64-bit destination.
/// \param __a
///    The 64-bit integer value to store.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si64(long long *__p, long long __a)
{
  __builtin_ia32_movnti64(__p, __a);
}
2242#endif
2243
/// \brief Forwards the given address to \c __builtin_ia32_clflush.
///
/// NOTE(review): per the Intel documentation for CLFLUSH this flushes the
/// cache line containing \a __p -- confirm against the instruction reference.
///
/// \param __p
///    Address handed to the builtin; the pointee is not modified through this
///    pointer.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clflush(void const *__p)
{
  __builtin_ia32_clflush(__p);
}
2249
/// \brief Issues a load fence by calling \c __builtin_ia32_lfence. Takes no
///    arguments and returns nothing.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}
2255
/// \brief Issues a memory fence by calling \c __builtin_ia32_mfence. Takes no
///    arguments and returns nothing.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}
2261
2262static __inline__ __m128i __DEFAULT_FN_ATTRS
2263_mm_packs_epi16(__m128i __a, __m128i __b)
2264{
2265  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
2266}
2267
2268static __inline__ __m128i __DEFAULT_FN_ATTRS
2269_mm_packs_epi32(__m128i __a, __m128i __b)
2270{
2271  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
2272}
2273
2274static __inline__ __m128i __DEFAULT_FN_ATTRS
2275_mm_packus_epi16(__m128i __a, __m128i __b)
2276{
2277  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
2278}
2279
2280static __inline__ int __DEFAULT_FN_ATTRS
2281_mm_extract_epi16(__m128i __a, int __imm)
2282{
2283  __v8hi __b = (__v8hi)__a;
2284  return (unsigned short)__b[__imm & 7];
2285}
2286
2287static __inline__ __m128i __DEFAULT_FN_ATTRS
2288_mm_insert_epi16(__m128i __a, int __b, int __imm)
2289{
2290  __v8hi __c = (__v8hi)__a;
2291  __c[__imm & 7] = __b;
2292  return (__m128i)__c;
2293}
2294
2295static __inline__ int __DEFAULT_FN_ATTRS
2296_mm_movemask_epi8(__m128i __a)
2297{
2298  return __builtin_ia32_pmovmskb128((__v16qi)__a);
2299}
2300
/// \brief Builds a vector of [4 x i32] by picking, for each result element,
///    one of the four 32-bit elements of \a a.
///
/// Result element k (k = 0..3) is element ((imm >> 2*k) & 0x3) of \a a. The
/// second shuffle operand is an undefined vector, but every index is masked
/// to the range 0..3 and therefore only ever selects from \a a.
///
/// \param a
///    A 128-bit integer vector, treated as [4 x i32].
/// \param imm
///    Selector holding one 2-bit field per result element, lowest field
///    first. It is used to form \c __builtin_shufflevector indices.
/// \returns A 128-bit integer vector containing the selected elements.
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si)_mm_undefined_si128(), \
                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
2306
/// \brief Builds a vector of [8 x i16] whose four lower elements are picked
///    from the four lower 16-bit elements of \a a; the four upper elements
///    are copied from \a a unchanged.
///
/// Result element k (k = 0..3) is element ((imm >> 2*k) & 0x3) of \a a, and
/// result elements 4..7 are elements 4..7 of \a a. The second shuffle operand
/// is an undefined vector that no index ever selects from.
///
/// \param a
///    A 128-bit integer vector, treated as [8 x i16].
/// \param imm
///    Selector holding one 2-bit field per lower result element, lowest field
///    first. It is used to form \c __builtin_shufflevector indices.
/// \returns A 128-bit integer vector containing the shuffled elements.
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })
2313
/// \brief Builds a vector of [8 x i16] whose four upper elements are picked
///    from the four upper 16-bit elements of \a a; the four lower elements
///    are copied from \a a unchanged.
///
/// Result elements 0..3 are elements 0..3 of \a a, and result element 4+k
/// (k = 0..3) is element 4 + ((imm >> 2*k) & 0x3) of \a a. The second shuffle
/// operand is an undefined vector that no index ever selects from.
///
/// \param a
///    A 128-bit integer vector, treated as [8 x i16].
/// \param imm
///    Selector holding one 2-bit field per upper result element, lowest field
///    first. It is used to form \c __builtin_shufflevector indices.
/// \returns A 128-bit integer vector containing the shuffled elements.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) >> 0) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })
2322
2323static __inline__ __m128i __DEFAULT_FN_ATTRS
2324_mm_unpackhi_epi8(__m128i __a, __m128i __b)
2325{
2326  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2327}
2328
2329static __inline__ __m128i __DEFAULT_FN_ATTRS
2330_mm_unpackhi_epi16(__m128i __a, __m128i __b)
2331{
2332  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
2333}
2334
2335static __inline__ __m128i __DEFAULT_FN_ATTRS
2336_mm_unpackhi_epi32(__m128i __a, __m128i __b)
2337{
2338  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
2339}
2340
2341static __inline__ __m128i __DEFAULT_FN_ATTRS
2342_mm_unpackhi_epi64(__m128i __a, __m128i __b)
2343{
2344  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
2345}
2346
2347static __inline__ __m128i __DEFAULT_FN_ATTRS
2348_mm_unpacklo_epi8(__m128i __a, __m128i __b)
2349{
2350  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
2351}
2352
2353static __inline__ __m128i __DEFAULT_FN_ATTRS
2354_mm_unpacklo_epi16(__m128i __a, __m128i __b)
2355{
2356  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
2357}
2358
2359static __inline__ __m128i __DEFAULT_FN_ATTRS
2360_mm_unpacklo_epi32(__m128i __a, __m128i __b)
2361{
2362  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
2363}
2364
2365static __inline__ __m128i __DEFAULT_FN_ATTRS
2366_mm_unpacklo_epi64(__m128i __a, __m128i __b)
2367{
2368  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
2369}
2370
2371static __inline__ __m64 __DEFAULT_FN_ATTRS
2372_mm_movepi64_pi64(__m128i __a)
2373{
2374  return (__m64)__a[0];
2375}
2376
2377static __inline__ __m128i __DEFAULT_FN_ATTRS
2378_mm_movpi64_epi64(__m64 __a)
2379{
2380  return (__m128i){ (long long)__a, 0 };
2381}
2382
2383static __inline__ __m128i __DEFAULT_FN_ATTRS
2384_mm_move_epi64(__m128i __a)
2385{
2386  return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
2387}
2388
2389static __inline__ __m128d __DEFAULT_FN_ATTRS
2390_mm_unpackhi_pd(__m128d __a, __m128d __b)
2391{
2392  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
2393}
2394
2395static __inline__ __m128d __DEFAULT_FN_ATTRS
2396_mm_unpacklo_pd(__m128d __a, __m128d __b)
2397{
2398  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
2399}
2400
2401static __inline__ int __DEFAULT_FN_ATTRS
2402_mm_movemask_pd(__m128d __a)
2403{
2404  return __builtin_ia32_movmskpd((__v2df)__a);
2405}
2406
/// \brief Builds a vector of [2 x double] by taking one element from each of
///    two source vectors.
///
/// Bit 0 of \a i picks which element of \a a becomes result element 0
/// (shuffle index 0 or 1); bit 1 of \a i picks which element of \a b becomes
/// result element 1 (shuffle index 2 or 3).
///
/// \param a
///    A 128-bit vector of [2 x double]; source of the lower result element.
/// \param b
///    A 128-bit vector of [2 x double]; source of the upper result element.
/// \param i
///    Selector; only bits 0 and 1 are used. It is used to form
///    \c __builtin_shufflevector indices.
/// \returns A 128-bit vector of [2 x double] containing the selected values.
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                   0 + (((i) >> 0) & 0x1), \
                                   2 + (((i) >> 1) & 0x1)); })
2411
2412static __inline__ __m128 __DEFAULT_FN_ATTRS
2413_mm_castpd_ps(__m128d __a)
2414{
2415  return (__m128)__a;
2416}
2417
2418static __inline__ __m128i __DEFAULT_FN_ATTRS
2419_mm_castpd_si128(__m128d __a)
2420{
2421  return (__m128i)__a;
2422}
2423
2424static __inline__ __m128d __DEFAULT_FN_ATTRS
2425_mm_castps_pd(__m128 __a)
2426{
2427  return (__m128d)__a;
2428}
2429
2430static __inline__ __m128i __DEFAULT_FN_ATTRS
2431_mm_castps_si128(__m128 __a)
2432{
2433  return (__m128i)__a;
2434}
2435
2436static __inline__ __m128 __DEFAULT_FN_ATTRS
2437_mm_castsi128_ps(__m128i __a)
2438{
2439  return (__m128)__a;
2440}
2441
2442static __inline__ __m128d __DEFAULT_FN_ATTRS
2443_mm_castsi128_pd(__m128i __a)
2444{
2445  return (__m128d)__a;
2446}
2447
/// \brief Calls \c __builtin_ia32_pause. Takes no arguments and returns
///    nothing.
///
/// NOTE(review): per the Intel documentation PAUSE is a hint intended for
/// spin-wait loops -- confirm against the instruction reference.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_pause(void)
{
  __builtin_ia32_pause();
}
2453
2454#undef __DEFAULT_FN_ATTRS
2455
/// Combines two selectors into one immediate value: \a x is shifted into
/// bit 1 and \a y is OR-ed into bit 0. Bits 0 and 1 are the two bits that
/// \c _mm_shuffle_pd reads from its selector argument.
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
2457
2458#endif /* __EMMINTRIN_H */
2459