1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public 16-byte vector types: a vector of double and a vector of long long. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Lane-typed 16-byte vectors used for casts inside the intrinsics below. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
41
42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43_mm_add_sd(__m128d a, __m128d b)
44{
45  a[0] += b[0];
46  return a;
47}
48
49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50_mm_add_pd(__m128d a, __m128d b)
51{
52  return a + b;
53}
54
55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56_mm_sub_sd(__m128d a, __m128d b)
57{
58  a[0] -= b[0];
59  return a;
60}
61
62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63_mm_sub_pd(__m128d a, __m128d b)
64{
65  return a - b;
66}
67
68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69_mm_mul_sd(__m128d a, __m128d b)
70{
71  a[0] *= b[0];
72  return a;
73}
74
75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76_mm_mul_pd(__m128d a, __m128d b)
77{
78  return a * b;
79}
80
81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82_mm_div_sd(__m128d a, __m128d b)
83{
84  a[0] /= b[0];
85  return a;
86}
87
88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89_mm_div_pd(__m128d a, __m128d b)
90{
91  return a / b;
92}
93
94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95_mm_sqrt_sd(__m128d a, __m128d b)
96{
97  __m128d c = __builtin_ia32_sqrtsd(b);
98  return (__m128d) { c[0], a[1] };
99}
100
101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102_mm_sqrt_pd(__m128d a)
103{
104  return __builtin_ia32_sqrtpd(a);
105}
106
107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108_mm_min_sd(__m128d a, __m128d b)
109{
110  return __builtin_ia32_minsd(a, b);
111}
112
113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114_mm_min_pd(__m128d a, __m128d b)
115{
116  return __builtin_ia32_minpd(a, b);
117}
118
119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120_mm_max_sd(__m128d a, __m128d b)
121{
122  return __builtin_ia32_maxsd(a, b);
123}
124
125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126_mm_max_pd(__m128d a, __m128d b)
127{
128  return __builtin_ia32_maxpd(a, b);
129}
130
131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132_mm_and_pd(__m128d a, __m128d b)
133{
134  return (__m128d)((__v4si)a & (__v4si)b);
135}
136
137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138_mm_andnot_pd(__m128d a, __m128d b)
139{
140  return (__m128d)(~(__v4si)a & (__v4si)b);
141}
142
143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144_mm_or_pd(__m128d a, __m128d b)
145{
146  return (__m128d)((__v4si)a | (__v4si)b);
147}
148
149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150_mm_xor_pd(__m128d a, __m128d b)
151{
152  return (__m128d)((__v4si)a ^ (__v4si)b);
153}
154
155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156_mm_cmpeq_pd(__m128d a, __m128d b)
157{
158  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
159}
160
161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162_mm_cmplt_pd(__m128d a, __m128d b)
163{
164  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
165}
166
167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168_mm_cmple_pd(__m128d a, __m128d b)
169{
170  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
171}
172
173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174_mm_cmpgt_pd(__m128d a, __m128d b)
175{
176  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
177}
178
179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180_mm_cmpge_pd(__m128d a, __m128d b)
181{
182  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
183}
184
185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186_mm_cmpord_pd(__m128d a, __m128d b)
187{
188  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
189}
190
191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192_mm_cmpunord_pd(__m128d a, __m128d b)
193{
194  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
195}
196
197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198_mm_cmpneq_pd(__m128d a, __m128d b)
199{
200  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
201}
202
203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204_mm_cmpnlt_pd(__m128d a, __m128d b)
205{
206  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
207}
208
209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210_mm_cmpnle_pd(__m128d a, __m128d b)
211{
212  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
213}
214
215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216_mm_cmpngt_pd(__m128d a, __m128d b)
217{
218  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
219}
220
221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222_mm_cmpnge_pd(__m128d a, __m128d b)
223{
224  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
225}
226
227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228_mm_cmpeq_sd(__m128d a, __m128d b)
229{
230  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
231}
232
233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234_mm_cmplt_sd(__m128d a, __m128d b)
235{
236  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
237}
238
239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240_mm_cmple_sd(__m128d a, __m128d b)
241{
242  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
243}
244
245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246_mm_cmpgt_sd(__m128d a, __m128d b)
247{
248  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
249}
250
251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
252_mm_cmpge_sd(__m128d a, __m128d b)
253{
254  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
255}
256
257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
258_mm_cmpord_sd(__m128d a, __m128d b)
259{
260  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
261}
262
263static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
264_mm_cmpunord_sd(__m128d a, __m128d b)
265{
266  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
267}
268
269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
270_mm_cmpneq_sd(__m128d a, __m128d b)
271{
272  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
273}
274
275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
276_mm_cmpnlt_sd(__m128d a, __m128d b)
277{
278  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
279}
280
281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
282_mm_cmpnle_sd(__m128d a, __m128d b)
283{
284  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
285}
286
287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
288_mm_cmpngt_sd(__m128d a, __m128d b)
289{
290  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
291}
292
293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
294_mm_cmpnge_sd(__m128d a, __m128d b)
295{
296  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
297}
298
299static __inline__ int __attribute__((__always_inline__, __nodebug__))
300_mm_comieq_sd(__m128d a, __m128d b)
301{
302  return __builtin_ia32_comisdeq(a, b);
303}
304
305static __inline__ int __attribute__((__always_inline__, __nodebug__))
306_mm_comilt_sd(__m128d a, __m128d b)
307{
308  return __builtin_ia32_comisdlt(a, b);
309}
310
311static __inline__ int __attribute__((__always_inline__, __nodebug__))
312_mm_comile_sd(__m128d a, __m128d b)
313{
314  return __builtin_ia32_comisdle(a, b);
315}
316
317static __inline__ int __attribute__((__always_inline__, __nodebug__))
318_mm_comigt_sd(__m128d a, __m128d b)
319{
320  return __builtin_ia32_comisdgt(a, b);
321}
322
323static __inline__ int __attribute__((__always_inline__, __nodebug__))
324_mm_comige_sd(__m128d a, __m128d b)
325{
326  return __builtin_ia32_comisdge(a, b);
327}
328
329static __inline__ int __attribute__((__always_inline__, __nodebug__))
330_mm_comineq_sd(__m128d a, __m128d b)
331{
332  return __builtin_ia32_comisdneq(a, b);
333}
334
335static __inline__ int __attribute__((__always_inline__, __nodebug__))
336_mm_ucomieq_sd(__m128d a, __m128d b)
337{
338  return __builtin_ia32_ucomisdeq(a, b);
339}
340
341static __inline__ int __attribute__((__always_inline__, __nodebug__))
342_mm_ucomilt_sd(__m128d a, __m128d b)
343{
344  return __builtin_ia32_ucomisdlt(a, b);
345}
346
347static __inline__ int __attribute__((__always_inline__, __nodebug__))
348_mm_ucomile_sd(__m128d a, __m128d b)
349{
350  return __builtin_ia32_ucomisdle(a, b);
351}
352
353static __inline__ int __attribute__((__always_inline__, __nodebug__))
354_mm_ucomigt_sd(__m128d a, __m128d b)
355{
356  return __builtin_ia32_ucomisdgt(a, b);
357}
358
359static __inline__ int __attribute__((__always_inline__, __nodebug__))
360_mm_ucomige_sd(__m128d a, __m128d b)
361{
362  return __builtin_ia32_ucomisdge(a, b);
363}
364
365static __inline__ int __attribute__((__always_inline__, __nodebug__))
366_mm_ucomineq_sd(__m128d a, __m128d b)
367{
368  return __builtin_ia32_ucomisdneq(a, b);
369}
370
371static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
372_mm_cvtpd_ps(__m128d a)
373{
374  return __builtin_ia32_cvtpd2ps(a);
375}
376
377static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
378_mm_cvtps_pd(__m128 a)
379{
380  return __builtin_ia32_cvtps2pd(a);
381}
382
383static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
384_mm_cvtepi32_pd(__m128i a)
385{
386  return __builtin_ia32_cvtdq2pd((__v4si)a);
387}
388
389static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
390_mm_cvtpd_epi32(__m128d a)
391{
392  return __builtin_ia32_cvtpd2dq(a);
393}
394
395static __inline__ int __attribute__((__always_inline__, __nodebug__))
396_mm_cvtsd_si32(__m128d a)
397{
398  return __builtin_ia32_cvtsd2si(a);
399}
400
401static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
402_mm_cvtsd_ss(__m128 a, __m128d b)
403{
404  a[0] = b[0];
405  return a;
406}
407
408static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
409_mm_cvtsi32_sd(__m128d a, int b)
410{
411  a[0] = b;
412  return a;
413}
414
415static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
416_mm_cvtss_sd(__m128d a, __m128 b)
417{
418  a[0] = b[0];
419  return a;
420}
421
422static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
423_mm_cvttpd_epi32(__m128d a)
424{
425  return (__m128i)__builtin_ia32_cvttpd2dq(a);
426}
427
428static __inline__ int __attribute__((__always_inline__, __nodebug__))
429_mm_cvttsd_si32(__m128d a)
430{
431  return a[0];
432}
433
434static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
435_mm_cvtpd_pi32(__m128d a)
436{
437  return (__m64)__builtin_ia32_cvtpd2pi(a);
438}
439
440static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
441_mm_cvttpd_pi32(__m128d a)
442{
443  return (__m64)__builtin_ia32_cvttpd2pi(a);
444}
445
446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
447_mm_cvtpi32_pd(__m64 a)
448{
449  return __builtin_ia32_cvtpi2pd((__v2si)a);
450}
451
452static __inline__ double __attribute__((__always_inline__, __nodebug__))
453_mm_cvtsd_f64(__m128d a)
454{
455  return a[0];
456}
457
458static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
459_mm_load_pd(double const *dp)
460{
461  return *(__m128d*)dp;
462}
463
464static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
465_mm_load1_pd(double const *dp)
466{
467  struct __mm_load1_pd_struct {
468    double u;
469  } __attribute__((__packed__, __may_alias__));
470  double u = ((struct __mm_load1_pd_struct*)dp)->u;
471  return (__m128d){ u, u };
472}
473
/* Alternate spelling: _mm_load_pd1(dp) expands to _mm_load1_pd(dp). */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
475
476static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
477_mm_loadr_pd(double const *dp)
478{
479  __m128d u = *(__m128d*)dp;
480  return __builtin_shufflevector(u, u, 1, 0);
481}
482
483static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
484_mm_loadu_pd(double const *dp)
485{
486  struct __loadu_pd {
487    __m128d v;
488  } __attribute__((packed, may_alias));
489  return ((struct __loadu_pd*)dp)->v;
490}
491
492static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
493_mm_load_sd(double const *dp)
494{
495  struct __mm_load_sd_struct {
496    double u;
497  } __attribute__((__packed__, __may_alias__));
498  double u = ((struct __mm_load_sd_struct*)dp)->u;
499  return (__m128d){ u, 0 };
500}
501
502static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
503_mm_loadh_pd(__m128d a, double const *dp)
504{
505  struct __mm_loadh_pd_struct {
506    double u;
507  } __attribute__((__packed__, __may_alias__));
508  double u = ((struct __mm_loadh_pd_struct*)dp)->u;
509  return (__m128d){ a[0], u };
510}
511
512static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
513_mm_loadl_pd(__m128d a, double const *dp)
514{
515  struct __mm_loadl_pd_struct {
516    double u;
517  } __attribute__((__packed__, __may_alias__));
518  double u = ((struct __mm_loadl_pd_struct*)dp)->u;
519  return (__m128d){ u, a[1] };
520}
521
522static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
523_mm_set_sd(double w)
524{
525  return (__m128d){ w, 0 };
526}
527
528static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
529_mm_set1_pd(double w)
530{
531  return (__m128d){ w, w };
532}
533
534static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
535_mm_set_pd(double w, double x)
536{
537  return (__m128d){ x, w };
538}
539
540static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
541_mm_setr_pd(double w, double x)
542{
543  return (__m128d){ w, x };
544}
545
546static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
547_mm_setzero_pd(void)
548{
549  return (__m128d){ 0, 0 };
550}
551
552static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
553_mm_move_sd(__m128d a, __m128d b)
554{
555  return (__m128d){ b[0], a[1] };
556}
557
558static __inline__ void __attribute__((__always_inline__, __nodebug__))
559_mm_store_sd(double *dp, __m128d a)
560{
561  struct __mm_store_sd_struct {
562    double u;
563  } __attribute__((__packed__, __may_alias__));
564  ((struct __mm_store_sd_struct*)dp)->u = a[0];
565}
566
567static __inline__ void __attribute__((__always_inline__, __nodebug__))
568_mm_store1_pd(double *dp, __m128d a)
569{
570  struct __mm_store1_pd_struct {
571    double u[2];
572  } __attribute__((__packed__, __may_alias__));
573  ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0];
574  ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0];
575}
576
577static __inline__ void __attribute__((__always_inline__, __nodebug__))
578_mm_store_pd(double *dp, __m128d a)
579{
580  *(__m128d *)dp = a;
581}
582
583static __inline__ void __attribute__((__always_inline__, __nodebug__))
584_mm_storeu_pd(double *dp, __m128d a)
585{
586  __builtin_ia32_storeupd(dp, a);
587}
588
589static __inline__ void __attribute__((__always_inline__, __nodebug__))
590_mm_storer_pd(double *dp, __m128d a)
591{
592  a = __builtin_shufflevector(a, a, 1, 0);
593  *(__m128d *)dp = a;
594}
595
596static __inline__ void __attribute__((__always_inline__, __nodebug__))
597_mm_storeh_pd(double *dp, __m128d a)
598{
599  struct __mm_storeh_pd_struct {
600    double u;
601  } __attribute__((__packed__, __may_alias__));
602  ((struct __mm_storeh_pd_struct*)dp)->u = a[1];
603}
604
605static __inline__ void __attribute__((__always_inline__, __nodebug__))
606_mm_storel_pd(double *dp, __m128d a)
607{
608  struct __mm_storeh_pd_struct {
609    double u;
610  } __attribute__((__packed__, __may_alias__));
611  ((struct __mm_storeh_pd_struct*)dp)->u = a[0];
612}
613
614static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
615_mm_add_epi8(__m128i a, __m128i b)
616{
617  return (__m128i)((__v16qi)a + (__v16qi)b);
618}
619
620static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
621_mm_add_epi16(__m128i a, __m128i b)
622{
623  return (__m128i)((__v8hi)a + (__v8hi)b);
624}
625
626static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
627_mm_add_epi32(__m128i a, __m128i b)
628{
629  return (__m128i)((__v4si)a + (__v4si)b);
630}
631
632static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
633_mm_add_si64(__m64 a, __m64 b)
634{
635  return a + b;
636}
637
638static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
639_mm_add_epi64(__m128i a, __m128i b)
640{
641  return a + b;
642}
643
644static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
645_mm_adds_epi8(__m128i a, __m128i b)
646{
647  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
648}
649
650static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
651_mm_adds_epi16(__m128i a, __m128i b)
652{
653  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
654}
655
656static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
657_mm_adds_epu8(__m128i a, __m128i b)
658{
659  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
660}
661
662static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
663_mm_adds_epu16(__m128i a, __m128i b)
664{
665  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
666}
667
668static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
669_mm_avg_epu8(__m128i a, __m128i b)
670{
671  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
672}
673
674static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
675_mm_avg_epu16(__m128i a, __m128i b)
676{
677  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
678}
679
680static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
681_mm_madd_epi16(__m128i a, __m128i b)
682{
683  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
684}
685
686static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
687_mm_max_epi16(__m128i a, __m128i b)
688{
689  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
690}
691
692static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
693_mm_max_epu8(__m128i a, __m128i b)
694{
695  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
696}
697
698static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
699_mm_min_epi16(__m128i a, __m128i b)
700{
701  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
702}
703
704static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
705_mm_min_epu8(__m128i a, __m128i b)
706{
707  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
708}
709
710static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
711_mm_mulhi_epi16(__m128i a, __m128i b)
712{
713  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
714}
715
716static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
717_mm_mulhi_epu16(__m128i a, __m128i b)
718{
719  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
720}
721
722static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
723_mm_mullo_epi16(__m128i a, __m128i b)
724{
725  return (__m128i)((__v8hi)a * (__v8hi)b);
726}
727
728static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
729_mm_mul_su32(__m64 a, __m64 b)
730{
731  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
732}
733
734static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
735_mm_mul_epu32(__m128i a, __m128i b)
736{
737  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
738}
739
740static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
741_mm_sad_epu8(__m128i a, __m128i b)
742{
743  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
744}
745
746static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
747_mm_sub_epi8(__m128i a, __m128i b)
748{
749  return (__m128i)((__v16qi)a - (__v16qi)b);
750}
751
752static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
753_mm_sub_epi16(__m128i a, __m128i b)
754{
755  return (__m128i)((__v8hi)a - (__v8hi)b);
756}
757
758static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
759_mm_sub_epi32(__m128i a, __m128i b)
760{
761  return (__m128i)((__v4si)a - (__v4si)b);
762}
763
764static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
765_mm_sub_si64(__m64 a, __m64 b)
766{
767  return a - b;
768}
769
770static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
771_mm_sub_epi64(__m128i a, __m128i b)
772{
773  return a - b;
774}
775
776static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
777_mm_subs_epi8(__m128i a, __m128i b)
778{
779  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
780}
781
782static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
783_mm_subs_epi16(__m128i a, __m128i b)
784{
785  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
786}
787
788static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
789_mm_subs_epu8(__m128i a, __m128i b)
790{
791  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
792}
793
794static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
795_mm_subs_epu16(__m128i a, __m128i b)
796{
797  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
798}
799
800static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
801_mm_and_si128(__m128i a, __m128i b)
802{
803  return a & b;
804}
805
806static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
807_mm_andnot_si128(__m128i a, __m128i b)
808{
809  return ~a & b;
810}
811
812static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
813_mm_or_si128(__m128i a, __m128i b)
814{
815  return a | b;
816}
817
818static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
819_mm_xor_si128(__m128i a, __m128i b)
820{
821  return a ^ b;
822}
823
/* Statement-expression macro: evaluates a once into a temporary, then calls
   __builtin_ia32_pslldqi128 with count multiplied by 8 (presumably a byte
   count converted to bits -- confirm against the builtin's contract). */
#define _mm_slli_si128(a, count) __extension__ ({ \
  __m128i __src = (a); \
  (__m128i)__builtin_ia32_pslldqi128(__src, (count) * 8); })
827
828static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
829_mm_slli_epi16(__m128i a, int count)
830{
831  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
832}
833
834static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
835_mm_sll_epi16(__m128i a, __m128i count)
836{
837  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
838}
839
840static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
841_mm_slli_epi32(__m128i a, int count)
842{
843  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
844}
845
846static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
847_mm_sll_epi32(__m128i a, __m128i count)
848{
849  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
850}
851
852static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
853_mm_slli_epi64(__m128i a, int count)
854{
855  return __builtin_ia32_psllqi128(a, count);
856}
857
858static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
859_mm_sll_epi64(__m128i a, __m128i count)
860{
861  return __builtin_ia32_psllq128(a, count);
862}
863
864static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
865_mm_srai_epi16(__m128i a, int count)
866{
867  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
868}
869
870static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
871_mm_sra_epi16(__m128i a, __m128i count)
872{
873  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
874}
875
876static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
877_mm_srai_epi32(__m128i a, int count)
878{
879  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
880}
881
882static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
883_mm_sra_epi32(__m128i a, __m128i count)
884{
885  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
886}
887
888
/* Statement-expression macro: evaluates a once into a temporary, then calls
   __builtin_ia32_psrldqi128 with count multiplied by 8 (presumably a byte
   count converted to bits -- confirm against the builtin's contract). */
#define _mm_srli_si128(a, count) __extension__ ({ \
  __m128i __src = (a); \
  (__m128i)__builtin_ia32_psrldqi128(__src, (count) * 8); })
892
893static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
894_mm_srli_epi16(__m128i a, int count)
895{
896  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
897}
898
899static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
900_mm_srl_epi16(__m128i a, __m128i count)
901{
902  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
903}
904
905static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
906_mm_srli_epi32(__m128i a, int count)
907{
908  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
909}
910
911static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
912_mm_srl_epi32(__m128i a, __m128i count)
913{
914  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
915}
916
917static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
918_mm_srli_epi64(__m128i a, int count)
919{
920  return __builtin_ia32_psrlqi128(a, count);
921}
922
923static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
924_mm_srl_epi64(__m128i a, __m128i count)
925{
926  return __builtin_ia32_psrlq128(a, count);
927}
928
929static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
930_mm_cmpeq_epi8(__m128i a, __m128i b)
931{
932  return (__m128i)((__v16qi)a == (__v16qi)b);
933}
934
935static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
936_mm_cmpeq_epi16(__m128i a, __m128i b)
937{
938  return (__m128i)((__v8hi)a == (__v8hi)b);
939}
940
941static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
942_mm_cmpeq_epi32(__m128i a, __m128i b)
943{
944  return (__m128i)((__v4si)a == (__v4si)b);
945}
946
947static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
948_mm_cmpgt_epi8(__m128i a, __m128i b)
949{
950  /* This function always performs a signed comparison, but __v16qi is a char
951     which may be signed or unsigned. */
952  typedef signed char __v16qs __attribute__((__vector_size__(16)));
953  return (__m128i)((__v16qs)a > (__v16qs)b);
954}
955
956static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
957_mm_cmpgt_epi16(__m128i a, __m128i b)
958{
959  return (__m128i)((__v8hi)a > (__v8hi)b);
960}
961
962static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
963_mm_cmpgt_epi32(__m128i a, __m128i b)
964{
965  return (__m128i)((__v4si)a > (__v4si)b);
966}
967
968static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
969_mm_cmplt_epi8(__m128i a, __m128i b)
970{
971  return _mm_cmpgt_epi8(b,a);
972}
973
974static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
975_mm_cmplt_epi16(__m128i a, __m128i b)
976{
977  return _mm_cmpgt_epi16(b,a);
978}
979
980static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
981_mm_cmplt_epi32(__m128i a, __m128i b)
982{
983  return _mm_cmpgt_epi32(b,a);
984}
985
986#ifdef __x86_64__
987static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
988_mm_cvtsi64_sd(__m128d a, long long b)
989{
990  a[0] = b;
991  return a;
992}
993
994static __inline__ long long __attribute__((__always_inline__, __nodebug__))
995_mm_cvtsd_si64(__m128d a)
996{
997  return __builtin_ia32_cvtsd2si64(a);
998}
999
1000static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1001_mm_cvttsd_si64(__m128d a)
1002{
1003  return a[0];
1004}
1005#endif
1006
1007static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1008_mm_cvtepi32_ps(__m128i a)
1009{
1010  return __builtin_ia32_cvtdq2ps((__v4si)a);
1011}
1012
1013static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1014_mm_cvtps_epi32(__m128 a)
1015{
1016  return (__m128i)__builtin_ia32_cvtps2dq(a);
1017}
1018
1019static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1020_mm_cvttps_epi32(__m128 a)
1021{
1022  return (__m128i)__builtin_ia32_cvttps2dq(a);
1023}
1024
1025static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1026_mm_cvtsi32_si128(int a)
1027{
1028  return (__m128i)(__v4si){ a, 0, 0, 0 };
1029}
1030
1031#ifdef __x86_64__
1032static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1033_mm_cvtsi64_si128(long long a)
1034{
1035  return (__m128i){ a, 0 };
1036}
1037#endif
1038
1039static __inline__ int __attribute__((__always_inline__, __nodebug__))
1040_mm_cvtsi128_si32(__m128i a)
1041{
1042  __v4si b = (__v4si)a;
1043  return b[0];
1044}
1045
1046#ifdef __x86_64__
1047static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1048_mm_cvtsi128_si64(__m128i a)
1049{
1050  return a[0];
1051}
1052#endif
1053
1054static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1055_mm_load_si128(__m128i const *p)
1056{
1057  return *p;
1058}
1059
1060static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1061_mm_loadu_si128(__m128i const *p)
1062{
1063  struct __loadu_si128 {
1064    __m128i v;
1065  } __attribute__((packed, may_alias));
1066  return ((struct __loadu_si128*)p)->v;
1067}
1068
1069static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1070_mm_loadl_epi64(__m128i const *p)
1071{
1072  struct __mm_loadl_epi64_struct {
1073    long long u;
1074  } __attribute__((__packed__, __may_alias__));
1075  return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0};
1076}
1077
1078static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1079_mm_set_epi64x(long long q1, long long q0)
1080{
1081  return (__m128i){ q0, q1 };
1082}
1083
1084static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1085_mm_set_epi64(__m64 q1, __m64 q0)
1086{
1087  return (__m128i){ (long long)q0, (long long)q1 };
1088}
1089
1090static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1091_mm_set_epi32(int i3, int i2, int i1, int i0)
1092{
1093  return (__m128i)(__v4si){ i0, i1, i2, i3};
1094}
1095
1096static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1097_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1098{
1099  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1100}
1101
1102static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1103_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1104{
1105  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1106}
1107
1108static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1109_mm_set1_epi64x(long long q)
1110{
1111  return (__m128i){ q, q };
1112}
1113
1114static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1115_mm_set1_epi64(__m64 q)
1116{
1117  return (__m128i){ (long long)q, (long long)q };
1118}
1119
1120static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1121_mm_set1_epi32(int i)
1122{
1123  return (__m128i)(__v4si){ i, i, i, i };
1124}
1125
1126static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1127_mm_set1_epi16(short w)
1128{
1129  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1130}
1131
1132static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1133_mm_set1_epi8(char b)
1134{
1135  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1136}
1137
1138static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1139_mm_setr_epi64(__m64 q0, __m64 q1)
1140{
1141  return (__m128i){ (long long)q0, (long long)q1 };
1142}
1143
1144static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1145_mm_setr_epi32(int i0, int i1, int i2, int i3)
1146{
1147  return (__m128i)(__v4si){ i0, i1, i2, i3};
1148}
1149
1150static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1151_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1152{
1153  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1154}
1155
1156static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1157_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1158{
1159  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1160}
1161
1162static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1163_mm_setzero_si128(void)
1164{
1165  return (__m128i){ 0LL, 0LL };
1166}
1167
1168static __inline__ void __attribute__((__always_inline__, __nodebug__))
1169_mm_store_si128(__m128i *p, __m128i b)
1170{
1171  *p = b;
1172}
1173
1174static __inline__ void __attribute__((__always_inline__, __nodebug__))
1175_mm_storeu_si128(__m128i *p, __m128i b)
1176{
1177  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1178}
1179
1180static __inline__ void __attribute__((__always_inline__, __nodebug__))
1181_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1182{
1183  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1184}
1185
1186static __inline__ void __attribute__((__always_inline__, __nodebug__))
1187_mm_storel_epi64(__m128i *p, __m128i a)
1188{
1189  struct __mm_storel_epi64_struct {
1190    long long u;
1191  } __attribute__((__packed__, __may_alias__));
1192  ((struct __mm_storel_epi64_struct*)p)->u = a[0];
1193}
1194
1195static __inline__ void __attribute__((__always_inline__, __nodebug__))
1196_mm_stream_pd(double *p, __m128d a)
1197{
1198  __builtin_ia32_movntpd(p, a);
1199}
1200
1201static __inline__ void __attribute__((__always_inline__, __nodebug__))
1202_mm_stream_si128(__m128i *p, __m128i a)
1203{
1204  __builtin_ia32_movntdq(p, a);
1205}
1206
/* Forwards p and a to the movnti builtin (presumably a non-temporal store of
 * the int a to *p). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *__dst = p;
  __builtin_ia32_movnti(__dst, a);
}
1212
/* Forwards p to the clflush builtin (presumably flushing the cache line that
 * contains p). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *__addr = p;
  __builtin_ia32_clflush(__addr);
}
1218
/* Invokes the lfence builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  (void)(__builtin_ia32_lfence(), 0);
}
1224
/* Invokes the mfence builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  (void)(__builtin_ia32_mfence(), 0);
}
1230
1231static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1232_mm_packs_epi16(__m128i a, __m128i b)
1233{
1234  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1235}
1236
1237static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1238_mm_packs_epi32(__m128i a, __m128i b)
1239{
1240  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1241}
1242
1243static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1244_mm_packus_epi16(__m128i a, __m128i b)
1245{
1246  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1247}
1248
1249static __inline__ int __attribute__((__always_inline__, __nodebug__))
1250_mm_extract_epi16(__m128i a, int imm)
1251{
1252  __v8hi b = (__v8hi)a;
1253  return (unsigned short)b[imm];
1254}
1255
1256static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1257_mm_insert_epi16(__m128i a, int b, int imm)
1258{
1259  __v8hi c = (__v8hi)a;
1260  c[imm & 7] = b;
1261  return (__m128i)c;
1262}
1263
1264static __inline__ int __attribute__((__always_inline__, __nodebug__))
1265_mm_movemask_epi8(__m128i a)
1266{
1267  return __builtin_ia32_pmovmskb128((__v16qi)a);
1268}
1269
/* Permutes the four 32-bit lanes of a.  Result lane k takes the source lane
 * named by bits [2k+1:2k] of imm.  The zero vector passed as the second
 * shuffle operand is never selected because every index is in 0..3. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
                                   ((imm) >> 0) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3); })
1275
/* Permutes the four low 16-bit lanes of a.  Result lane k (k = 0..3) takes
 * the low lane named by bits [2k+1:2k] of imm; lanes 4..7 are copied
 * unchanged.  The zero second operand is never selected. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   ((imm) >> 0) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })
1282
/* Permutes the four high 16-bit lanes of a.  Lanes 0..3 are copied
 * unchanged; result lane 4+k (k = 0..3) takes high lane 4 + the value of
 * bits [2k+1:2k] of imm.  The zero second operand is never selected. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) >> 0) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })
1291
1292static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1293_mm_unpackhi_epi8(__m128i a, __m128i b)
1294{
1295  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1296}
1297
1298static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1299_mm_unpackhi_epi16(__m128i a, __m128i b)
1300{
1301  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1302}
1303
1304static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1305_mm_unpackhi_epi32(__m128i a, __m128i b)
1306{
1307  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1308}
1309
1310static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1311_mm_unpackhi_epi64(__m128i a, __m128i b)
1312{
1313  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1314}
1315
1316static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1317_mm_unpacklo_epi8(__m128i a, __m128i b)
1318{
1319  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1320}
1321
1322static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1323_mm_unpacklo_epi16(__m128i a, __m128i b)
1324{
1325  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1326}
1327
1328static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1329_mm_unpacklo_epi32(__m128i a, __m128i b)
1330{
1331  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1332}
1333
1334static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1335_mm_unpacklo_epi64(__m128i a, __m128i b)
1336{
1337  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1338}
1339
1340static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1341_mm_movepi64_pi64(__m128i a)
1342{
1343  return (__m64)a[0];
1344}
1345
1346static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1347_mm_movpi64_pi64(__m64 a)
1348{
1349  return (__m128i){ (long long)a, 0 };
1350}
1351
1352static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1353_mm_move_epi64(__m128i a)
1354{
1355  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1356}
1357
1358static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1359_mm_unpackhi_pd(__m128d a, __m128d b)
1360{
1361  return __builtin_shufflevector(a, b, 1, 2+1);
1362}
1363
1364static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1365_mm_unpacklo_pd(__m128d a, __m128d b)
1366{
1367  return __builtin_shufflevector(a, b, 0, 2+0);
1368}
1369
1370static __inline__ int __attribute__((__always_inline__, __nodebug__))
1371_mm_movemask_pd(__m128d a)
1372{
1373  return __builtin_ia32_movmskpd(a);
1374}
1375
/* Builds a two-double vector: lane 0 is a[bit 0 of i] and lane 1 is
 * b[bit 1 of i] (shuffle indices 2 and 3 address b). */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  __builtin_shufflevector(__a, __b, \
                          ((i) >> 0) & 1, \
                          2 + (((i) >> 1) & 1)); })
1380
1381static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1382_mm_castpd_ps(__m128d in)
1383{
1384  return (__m128)in;
1385}
1386
1387static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1388_mm_castpd_si128(__m128d in)
1389{
1390  return (__m128i)in;
1391}
1392
1393static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1394_mm_castps_pd(__m128 in)
1395{
1396  return (__m128d)in;
1397}
1398
1399static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1400_mm_castps_si128(__m128 in)
1401{
1402  return (__m128i)in;
1403}
1404
1405static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1406_mm_castsi128_ps(__m128i in)
1407{
1408  return (__m128)in;
1409}
1410
1411static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1412_mm_castsi128_pd(__m128i in)
1413{
1414  return (__m128d)in;
1415}
1416
/* Emits a single `pause` instruction through volatile inline assembly. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1422
/* Packs two selectors into one value: x goes in bit 1, y in bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1424
1425#endif /* __SSE2__ */
1426
1427#endif /* __EMMINTRIN_H */
1428