emmintrin.h revision f42f85ce6c2c1ddbe57535898dfbe3a37f7199af
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public 16-byte vector types: two doubles, and two long longs. */
typedef double    __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Lane-typed 16-byte vector views used for casts inside this header.
   NOTE(review): __v4si is used below but not declared here; presumably it
   comes from <xmmintrin.h> - confirm. */
typedef double    __v2df  __attribute__((__vector_size__(16)));
typedef long long __v2di  __attribute__((__vector_size__(16)));
typedef short     __v8hi  __attribute__((__vector_size__(16)));
typedef char      __v16qi __attribute__((__vector_size__(16)));
41
42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43_mm_add_sd(__m128d a, __m128d b)
44{
45  a[0] += b[0];
46  return a;
47}
48
49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50_mm_add_pd(__m128d a, __m128d b)
51{
52  return a + b;
53}
54
55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56_mm_sub_sd(__m128d a, __m128d b)
57{
58  a[0] -= b[0];
59  return a;
60}
61
62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63_mm_sub_pd(__m128d a, __m128d b)
64{
65  return a - b;
66}
67
68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69_mm_mul_sd(__m128d a, __m128d b)
70{
71  a[0] *= b[0];
72  return a;
73}
74
75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76_mm_mul_pd(__m128d a, __m128d b)
77{
78  return a * b;
79}
80
81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82_mm_div_sd(__m128d a, __m128d b)
83{
84  a[0] /= b[0];
85  return a;
86}
87
88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89_mm_div_pd(__m128d a, __m128d b)
90{
91  return a / b;
92}
93
94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95_mm_sqrt_sd(__m128d a, __m128d b)
96{
97  __m128d c = __builtin_ia32_sqrtsd(b);
98  return (__m128d) { c[0], a[1] };
99}
100
101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102_mm_sqrt_pd(__m128d a)
103{
104  return __builtin_ia32_sqrtpd(a);
105}
106
107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108_mm_min_sd(__m128d a, __m128d b)
109{
110  return __builtin_ia32_minsd(a, b);
111}
112
113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114_mm_min_pd(__m128d a, __m128d b)
115{
116  return __builtin_ia32_minpd(a, b);
117}
118
119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120_mm_max_sd(__m128d a, __m128d b)
121{
122  return __builtin_ia32_maxsd(a, b);
123}
124
125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126_mm_max_pd(__m128d a, __m128d b)
127{
128  return __builtin_ia32_maxpd(a, b);
129}
130
131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132_mm_and_pd(__m128d a, __m128d b)
133{
134  return (__m128d)((__v4si)a & (__v4si)b);
135}
136
137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138_mm_andnot_pd(__m128d a, __m128d b)
139{
140  return (__m128d)(~(__v4si)a & (__v4si)b);
141}
142
143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144_mm_or_pd(__m128d a, __m128d b)
145{
146  return (__m128d)((__v4si)a | (__v4si)b);
147}
148
149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150_mm_xor_pd(__m128d a, __m128d b)
151{
152  return (__m128d)((__v4si)a ^ (__v4si)b);
153}
154
155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156_mm_cmpeq_pd(__m128d a, __m128d b)
157{
158  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
159}
160
161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162_mm_cmplt_pd(__m128d a, __m128d b)
163{
164  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
165}
166
167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168_mm_cmple_pd(__m128d a, __m128d b)
169{
170  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
171}
172
173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174_mm_cmpgt_pd(__m128d a, __m128d b)
175{
176  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
177}
178
179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180_mm_cmpge_pd(__m128d a, __m128d b)
181{
182  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
183}
184
185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186_mm_cmpord_pd(__m128d a, __m128d b)
187{
188  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
189}
190
191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192_mm_cmpunord_pd(__m128d a, __m128d b)
193{
194  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
195}
196
197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198_mm_cmpneq_pd(__m128d a, __m128d b)
199{
200  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
201}
202
203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204_mm_cmpnlt_pd(__m128d a, __m128d b)
205{
206  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
207}
208
209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210_mm_cmpnle_pd(__m128d a, __m128d b)
211{
212  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
213}
214
215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216_mm_cmpngt_pd(__m128d a, __m128d b)
217{
218  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
219}
220
221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222_mm_cmpnge_pd(__m128d a, __m128d b)
223{
224  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
225}
226
227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228_mm_cmpeq_sd(__m128d a, __m128d b)
229{
230  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
231}
232
233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234_mm_cmplt_sd(__m128d a, __m128d b)
235{
236  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
237}
238
239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240_mm_cmple_sd(__m128d a, __m128d b)
241{
242  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
243}
244
245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246_mm_cmpgt_sd(__m128d a, __m128d b)
247{
248  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
249}
250
251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
252_mm_cmpge_sd(__m128d a, __m128d b)
253{
254  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
255}
256
257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
258_mm_cmpord_sd(__m128d a, __m128d b)
259{
260  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
261}
262
263static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
264_mm_cmpunord_sd(__m128d a, __m128d b)
265{
266  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
267}
268
269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
270_mm_cmpneq_sd(__m128d a, __m128d b)
271{
272  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
273}
274
275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
276_mm_cmpnlt_sd(__m128d a, __m128d b)
277{
278  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
279}
280
281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
282_mm_cmpnle_sd(__m128d a, __m128d b)
283{
284  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
285}
286
287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
288_mm_cmpngt_sd(__m128d a, __m128d b)
289{
290  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
291}
292
293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
294_mm_cmpnge_sd(__m128d a, __m128d b)
295{
296  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
297}
298
299static __inline__ int __attribute__((__always_inline__, __nodebug__))
300_mm_comieq_sd(__m128d a, __m128d b)
301{
302  return __builtin_ia32_comisdeq(a, b);
303}
304
305static __inline__ int __attribute__((__always_inline__, __nodebug__))
306_mm_comilt_sd(__m128d a, __m128d b)
307{
308  return __builtin_ia32_comisdlt(a, b);
309}
310
311static __inline__ int __attribute__((__always_inline__, __nodebug__))
312_mm_comile_sd(__m128d a, __m128d b)
313{
314  return __builtin_ia32_comisdle(a, b);
315}
316
317static __inline__ int __attribute__((__always_inline__, __nodebug__))
318_mm_comigt_sd(__m128d a, __m128d b)
319{
320  return __builtin_ia32_comisdgt(a, b);
321}
322
323static __inline__ int __attribute__((__always_inline__, __nodebug__))
324_mm_comige_sd(__m128d a, __m128d b)
325{
326  return __builtin_ia32_comisdge(a, b);
327}
328
329static __inline__ int __attribute__((__always_inline__, __nodebug__))
330_mm_comineq_sd(__m128d a, __m128d b)
331{
332  return __builtin_ia32_comisdneq(a, b);
333}
334
335static __inline__ int __attribute__((__always_inline__, __nodebug__))
336_mm_ucomieq_sd(__m128d a, __m128d b)
337{
338  return __builtin_ia32_ucomisdeq(a, b);
339}
340
341static __inline__ int __attribute__((__always_inline__, __nodebug__))
342_mm_ucomilt_sd(__m128d a, __m128d b)
343{
344  return __builtin_ia32_ucomisdlt(a, b);
345}
346
347static __inline__ int __attribute__((__always_inline__, __nodebug__))
348_mm_ucomile_sd(__m128d a, __m128d b)
349{
350  return __builtin_ia32_ucomisdle(a, b);
351}
352
353static __inline__ int __attribute__((__always_inline__, __nodebug__))
354_mm_ucomigt_sd(__m128d a, __m128d b)
355{
356  return __builtin_ia32_ucomisdgt(a, b);
357}
358
359static __inline__ int __attribute__((__always_inline__, __nodebug__))
360_mm_ucomige_sd(__m128d a, __m128d b)
361{
362  return __builtin_ia32_ucomisdge(a, b);
363}
364
365static __inline__ int __attribute__((__always_inline__, __nodebug__))
366_mm_ucomineq_sd(__m128d a, __m128d b)
367{
368  return __builtin_ia32_ucomisdneq(a, b);
369}
370
371static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
372_mm_cvtpd_ps(__m128d a)
373{
374  return __builtin_ia32_cvtpd2ps(a);
375}
376
377static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
378_mm_cvtps_pd(__m128 a)
379{
380  return __builtin_ia32_cvtps2pd(a);
381}
382
383static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
384_mm_cvtepi32_pd(__m128i a)
385{
386  return __builtin_ia32_cvtdq2pd((__v4si)a);
387}
388
389static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
390_mm_cvtpd_epi32(__m128d a)
391{
392  return __builtin_ia32_cvtpd2dq(a);
393}
394
395static __inline__ int __attribute__((__always_inline__, __nodebug__))
396_mm_cvtsd_si32(__m128d a)
397{
398  return __builtin_ia32_cvtsd2si(a);
399}
400
401static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
402_mm_cvtsd_ss(__m128 a, __m128d b)
403{
404  a[0] = b[0];
405  return a;
406}
407
408static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
409_mm_cvtsi32_sd(__m128d a, int b)
410{
411  a[0] = b;
412  return a;
413}
414
415static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
416_mm_cvtss_sd(__m128d a, __m128 b)
417{
418  a[0] = b[0];
419  return a;
420}
421
422static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
423_mm_cvttpd_epi32(__m128d a)
424{
425  return (__m128i)__builtin_ia32_cvttpd2dq(a);
426}
427
428static __inline__ int __attribute__((__always_inline__, __nodebug__))
429_mm_cvttsd_si32(__m128d a)
430{
431  return a[0];
432}
433
434static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
435_mm_cvtpd_pi32(__m128d a)
436{
437  return (__m64)__builtin_ia32_cvtpd2pi(a);
438}
439
440static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
441_mm_cvttpd_pi32(__m128d a)
442{
443  return (__m64)__builtin_ia32_cvttpd2pi(a);
444}
445
446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
447_mm_cvtpi32_pd(__m64 a)
448{
449  return __builtin_ia32_cvtpi2pd((__v2si)a);
450}
451
452static __inline__ double __attribute__((__always_inline__, __nodebug__))
453_mm_cvtsd_f64(__m128d a)
454{
455  return a[0];
456}
457
458static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
459_mm_load_pd(double const *dp)
460{
461  return *(__m128d*)dp;
462}
463
464static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
465_mm_load1_pd(double const *dp)
466{
467  struct __mm_load1_pd_struct {
468    double u;
469  } __attribute__((__packed__, __may_alias__));
470  double u = ((struct __mm_load1_pd_struct*)dp)->u;
471  return (__m128d){ u, u };
472}
473
/* Alternate spelling of _mm_load1_pd: expands to a call with the same
   argument. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
475
476static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
477_mm_loadr_pd(double const *dp)
478{
479  __m128d u = *(__m128d*)dp;
480  return __builtin_shufflevector(u, u, 1, 0);
481}
482
483static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
484_mm_loadu_pd(double const *dp)
485{
486  struct __loadu_pd {
487    __m128d v;
488  } __attribute__((packed, may_alias));
489  return ((struct __loadu_pd*)dp)->v;
490}
491
492static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
493_mm_load_sd(double const *dp)
494{
495  struct __mm_load_sd_struct {
496    double u;
497  } __attribute__((__packed__, __may_alias__));
498  double u = ((struct __mm_load_sd_struct*)dp)->u;
499  return (__m128d){ u, 0 };
500}
501
502static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
503_mm_loadh_pd(__m128d a, double const *dp)
504{
505  struct __mm_loadh_pd_struct {
506    double u;
507  } __attribute__((__packed__, __may_alias__));
508  double u = ((struct __mm_loadh_pd_struct*)dp)->u;
509  return (__m128d){ a[0], u };
510}
511
512static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
513_mm_loadl_pd(__m128d a, double const *dp)
514{
515  struct __mm_loadl_pd_struct {
516    double u;
517  } __attribute__((__packed__, __may_alias__));
518  double u = ((struct __mm_loadl_pd_struct*)dp)->u;
519  return (__m128d){ u, a[1] };
520}
521
522static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
523_mm_set_sd(double w)
524{
525  return (__m128d){ w, 0 };
526}
527
528static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
529_mm_set1_pd(double w)
530{
531  return (__m128d){ w, w };
532}
533
534static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
535_mm_set_pd(double w, double x)
536{
537  return (__m128d){ x, w };
538}
539
540static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
541_mm_setr_pd(double w, double x)
542{
543  return (__m128d){ w, x };
544}
545
546static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
547_mm_setzero_pd(void)
548{
549  return (__m128d){ 0, 0 };
550}
551
552static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
553_mm_move_sd(__m128d a, __m128d b)
554{
555  return (__m128d){ b[0], a[1] };
556}
557
558static __inline__ void __attribute__((__always_inline__, __nodebug__))
559_mm_store_sd(double *dp, __m128d a)
560{
561  struct __mm_store_sd_struct {
562    double u;
563  } __attribute__((__packed__, __may_alias__));
564  ((struct __mm_store_sd_struct*)dp)->u = a[0];
565}
566
567static __inline__ void __attribute__((__always_inline__, __nodebug__))
568_mm_store1_pd(double *dp, __m128d a)
569{
570  struct __mm_store1_pd_struct {
571    double u[2];
572  } __attribute__((__packed__, __may_alias__));
573  ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0];
574  ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0];
575}
576
577static __inline__ void __attribute__((__always_inline__, __nodebug__))
578_mm_store_pd(double *dp, __m128d a)
579{
580  *(__m128d *)dp = a;
581}
582
583static __inline__ void __attribute__((__always_inline__, __nodebug__))
584_mm_storeu_pd(double *dp, __m128d a)
585{
586  __builtin_ia32_storeupd(dp, a);
587}
588
589static __inline__ void __attribute__((__always_inline__, __nodebug__))
590_mm_storer_pd(double *dp, __m128d a)
591{
592  a = __builtin_shufflevector(a, a, 1, 0);
593  *(__m128d *)dp = a;
594}
595
596static __inline__ void __attribute__((__always_inline__, __nodebug__))
597_mm_storeh_pd(double *dp, __m128d a)
598{
599  struct __mm_storeh_pd_struct {
600    double u;
601  } __attribute__((__packed__, __may_alias__));
602  ((struct __mm_storeh_pd_struct*)dp)->u = a[1];
603}
604
605static __inline__ void __attribute__((__always_inline__, __nodebug__))
606_mm_storel_pd(double *dp, __m128d a)
607{
608  struct __mm_storeh_pd_struct {
609    double u;
610  } __attribute__((__packed__, __may_alias__));
611  ((struct __mm_storeh_pd_struct*)dp)->u = a[0];
612}
613
614static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
615_mm_add_epi8(__m128i a, __m128i b)
616{
617  return (__m128i)((__v16qi)a + (__v16qi)b);
618}
619
620static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
621_mm_add_epi16(__m128i a, __m128i b)
622{
623  return (__m128i)((__v8hi)a + (__v8hi)b);
624}
625
626static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
627_mm_add_epi32(__m128i a, __m128i b)
628{
629  return (__m128i)((__v4si)a + (__v4si)b);
630}
631
632static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
633_mm_add_si64(__m64 a, __m64 b)
634{
635  return a + b;
636}
637
638static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
639_mm_add_epi64(__m128i a, __m128i b)
640{
641  return a + b;
642}
643
644static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
645_mm_adds_epi8(__m128i a, __m128i b)
646{
647  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
648}
649
650static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
651_mm_adds_epi16(__m128i a, __m128i b)
652{
653  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
654}
655
656static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
657_mm_adds_epu8(__m128i a, __m128i b)
658{
659  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
660}
661
662static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
663_mm_adds_epu16(__m128i a, __m128i b)
664{
665  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
666}
667
668static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
669_mm_avg_epu8(__m128i a, __m128i b)
670{
671  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
672}
673
674static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
675_mm_avg_epu16(__m128i a, __m128i b)
676{
677  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
678}
679
680static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
681_mm_madd_epi16(__m128i a, __m128i b)
682{
683  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
684}
685
686static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
687_mm_max_epi16(__m128i a, __m128i b)
688{
689  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
690}
691
692static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
693_mm_max_epu8(__m128i a, __m128i b)
694{
695  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
696}
697
698static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
699_mm_min_epi16(__m128i a, __m128i b)
700{
701  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
702}
703
704static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
705_mm_min_epu8(__m128i a, __m128i b)
706{
707  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
708}
709
710static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
711_mm_mulhi_epi16(__m128i a, __m128i b)
712{
713  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
714}
715
716static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
717_mm_mulhi_epu16(__m128i a, __m128i b)
718{
719  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
720}
721
722static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
723_mm_mullo_epi16(__m128i a, __m128i b)
724{
725  return (__m128i)((__v8hi)a * (__v8hi)b);
726}
727
728static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
729_mm_mul_su32(__m64 a, __m64 b)
730{
731  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
732}
733
734static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
735_mm_mul_epu32(__m128i a, __m128i b)
736{
737  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
738}
739
740static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
741_mm_sad_epu8(__m128i a, __m128i b)
742{
743  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
744}
745
746static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
747_mm_sub_epi8(__m128i a, __m128i b)
748{
749  return (__m128i)((__v16qi)a - (__v16qi)b);
750}
751
752static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
753_mm_sub_epi16(__m128i a, __m128i b)
754{
755  return (__m128i)((__v8hi)a - (__v8hi)b);
756}
757
758static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
759_mm_sub_epi32(__m128i a, __m128i b)
760{
761  return (__m128i)((__v4si)a - (__v4si)b);
762}
763
764static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
765_mm_sub_si64(__m64 a, __m64 b)
766{
767  return a - b;
768}
769
770static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
771_mm_sub_epi64(__m128i a, __m128i b)
772{
773  return a - b;
774}
775
776static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
777_mm_subs_epi8(__m128i a, __m128i b)
778{
779  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
780}
781
782static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
783_mm_subs_epi16(__m128i a, __m128i b)
784{
785  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
786}
787
788static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
789_mm_subs_epu8(__m128i a, __m128i b)
790{
791  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
792}
793
794static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
795_mm_subs_epu16(__m128i a, __m128i b)
796{
797  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
798}
799
800static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
801_mm_and_si128(__m128i a, __m128i b)
802{
803  return a & b;
804}
805
806static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
807_mm_andnot_si128(__m128i a, __m128i b)
808{
809  return ~a & b;
810}
811
812static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
813_mm_or_si128(__m128i a, __m128i b)
814{
815  return a | b;
816}
817
818static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
819_mm_xor_si128(__m128i a, __m128i b)
820{
821  return a ^ b;
822}
823
/* Statement-expression macro: evaluates a once into a local, then passes it
   to __builtin_ia32_pslldqi128 together with count multiplied by 8.
   NOTE(review): presumably a macro rather than a function so that count
   reaches the builtin as a compile-time constant - confirm. */
#define _mm_slli_si128(a, count) __extension__ ({ \
  __m128i __src = (a); \
  (__m128i)__builtin_ia32_pslldqi128(__src, (count)*8); })
827
828static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
829_mm_slli_epi16(__m128i a, int count)
830{
831  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
832}
833
834static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
835_mm_sll_epi16(__m128i a, __m128i count)
836{
837  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
838}
839
840static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
841_mm_slli_epi32(__m128i a, int count)
842{
843  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
844}
845
846static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
847_mm_sll_epi32(__m128i a, __m128i count)
848{
849  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
850}
851
852static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
853_mm_slli_epi64(__m128i a, int count)
854{
855  return __builtin_ia32_psllqi128(a, count);
856}
857
858static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
859_mm_sll_epi64(__m128i a, __m128i count)
860{
861  return __builtin_ia32_psllq128(a, count);
862}
863
864static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
865_mm_srai_epi16(__m128i a, int count)
866{
867  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
868}
869
870static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
871_mm_sra_epi16(__m128i a, __m128i count)
872{
873  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
874}
875
876static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
877_mm_srai_epi32(__m128i a, int count)
878{
879  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
880}
881
882static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
883_mm_sra_epi32(__m128i a, __m128i count)
884{
885  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
886}
887
888
/* Statement-expression macro: evaluates a once into a local, then passes it
   to __builtin_ia32_psrldqi128 together with count multiplied by 8.
   NOTE(review): presumably a macro rather than a function so that count
   reaches the builtin as a compile-time constant - confirm. */
#define _mm_srli_si128(a, count) __extension__ ({ \
  __m128i __src = (a); \
  (__m128i)__builtin_ia32_psrldqi128(__src, (count)*8); })
892
893static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
894_mm_srli_epi16(__m128i a, int count)
895{
896  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
897}
898
899static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
900_mm_srl_epi16(__m128i a, __m128i count)
901{
902  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
903}
904
905static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
906_mm_srli_epi32(__m128i a, int count)
907{
908  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
909}
910
911static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
912_mm_srl_epi32(__m128i a, __m128i count)
913{
914  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
915}
916
917static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
918_mm_srli_epi64(__m128i a, int count)
919{
920  return __builtin_ia32_psrlqi128(a, count);
921}
922
923static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
924_mm_srl_epi64(__m128i a, __m128i count)
925{
926  return __builtin_ia32_psrlq128(a, count);
927}
928
929static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
930_mm_cmpeq_epi8(__m128i a, __m128i b)
931{
932  return (__m128i)((__v16qi)a == (__v16qi)b);
933}
934
935static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
936_mm_cmpeq_epi16(__m128i a, __m128i b)
937{
938  return (__m128i)((__v8hi)a == (__v8hi)b);
939}
940
941static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
942_mm_cmpeq_epi32(__m128i a, __m128i b)
943{
944  return (__m128i)((__v4si)a == (__v4si)b);
945}
946
947static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
948_mm_cmpgt_epi8(__m128i a, __m128i b)
949{
950  typedef signed char __v16qs __attribute__((__vector_size__(16)));
951  return (__m128i)((__v16qs)a > (__v16qs)b);
952}
953
954static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
955_mm_cmpgt_epi16(__m128i a, __m128i b)
956{
957  return (__m128i)((__v8hi)a > (__v8hi)b);
958}
959
960static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
961_mm_cmpgt_epi32(__m128i a, __m128i b)
962{
963  return (__m128i)((__v4si)a > (__v4si)b);
964}
965
966static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
967_mm_cmplt_epi8(__m128i a, __m128i b)
968{
969  return _mm_cmpgt_epi8(b,a);
970}
971
972static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
973_mm_cmplt_epi16(__m128i a, __m128i b)
974{
975  return _mm_cmpgt_epi16(b,a);
976}
977
978static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
979_mm_cmplt_epi32(__m128i a, __m128i b)
980{
981  return _mm_cmpgt_epi32(b,a);
982}
983
984#ifdef __x86_64__
985static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
986_mm_cvtsi64_sd(__m128d a, long long b)
987{
988  a[0] = b;
989  return a;
990}
991
992static __inline__ long long __attribute__((__always_inline__, __nodebug__))
993_mm_cvtsd_si64(__m128d a)
994{
995  return __builtin_ia32_cvtsd2si64(a);
996}
997
998static __inline__ long long __attribute__((__always_inline__, __nodebug__))
999_mm_cvttsd_si64(__m128d a)
1000{
1001  return a[0];
1002}
1003#endif
1004
1005static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1006_mm_cvtepi32_ps(__m128i a)
1007{
1008  return __builtin_ia32_cvtdq2ps((__v4si)a);
1009}
1010
1011static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1012_mm_cvtps_epi32(__m128 a)
1013{
1014  return (__m128i)__builtin_ia32_cvtps2dq(a);
1015}
1016
1017static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1018_mm_cvttps_epi32(__m128 a)
1019{
1020  return (__m128i)__builtin_ia32_cvttps2dq(a);
1021}
1022
1023static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1024_mm_cvtsi32_si128(int a)
1025{
1026  return (__m128i)(__v4si){ a, 0, 0, 0 };
1027}
1028
1029#ifdef __x86_64__
1030static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1031_mm_cvtsi64_si128(long long a)
1032{
1033  return (__m128i){ a, 0 };
1034}
1035#endif
1036
1037static __inline__ int __attribute__((__always_inline__, __nodebug__))
1038_mm_cvtsi128_si32(__m128i a)
1039{
1040  __v4si b = (__v4si)a;
1041  return b[0];
1042}
1043
1044#ifdef __x86_64__
1045static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1046_mm_cvtsi128_si64(__m128i a)
1047{
1048  return a[0];
1049}
1050#endif
1051
1052static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1053_mm_load_si128(__m128i const *p)
1054{
1055  return *p;
1056}
1057
1058static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1059_mm_loadu_si128(__m128i const *p)
1060{
1061  struct __loadu_si128 {
1062    __m128i v;
1063  } __attribute__((packed, may_alias));
1064  return ((struct __loadu_si128*)p)->v;
1065}
1066
1067static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1068_mm_loadl_epi64(__m128i const *p)
1069{
1070  struct __mm_loadl_epi64_struct {
1071    long long u;
1072  } __attribute__((__packed__, __may_alias__));
1073  return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0};
1074}
1075
1076static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1077_mm_set_epi64x(long long q1, long long q0)
1078{
1079  return (__m128i){ q0, q1 };
1080}
1081
1082static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1083_mm_set_epi64(__m64 q1, __m64 q0)
1084{
1085  return (__m128i){ (long long)q0, (long long)q1 };
1086}
1087
1088static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1089_mm_set_epi32(int i3, int i2, int i1, int i0)
1090{
1091  return (__m128i)(__v4si){ i0, i1, i2, i3};
1092}
1093
1094static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1095_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1096{
1097  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1098}
1099
1100static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1101_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1102{
1103  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1104}
1105
1106static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1107_mm_set1_epi64x(long long q)
1108{
1109  return (__m128i){ q, q };
1110}
1111
1112static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1113_mm_set1_epi64(__m64 q)
1114{
1115  return (__m128i){ (long long)q, (long long)q };
1116}
1117
1118static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1119_mm_set1_epi32(int i)
1120{
1121  return (__m128i)(__v4si){ i, i, i, i };
1122}
1123
1124static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1125_mm_set1_epi16(short w)
1126{
1127  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1128}
1129
1130static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1131_mm_set1_epi8(char b)
1132{
1133  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1134}
1135
1136static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1137_mm_setr_epi64(__m64 q0, __m64 q1)
1138{
1139  return (__m128i){ (long long)q0, (long long)q1 };
1140}
1141
1142static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1143_mm_setr_epi32(int i0, int i1, int i2, int i3)
1144{
1145  return (__m128i)(__v4si){ i0, i1, i2, i3};
1146}
1147
1148static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1149_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1150{
1151  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1152}
1153
1154static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1155_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1156{
1157  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1158}
1159
1160static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1161_mm_setzero_si128(void)
1162{
1163  return (__m128i){ 0LL, 0LL };
1164}
1165
1166static __inline__ void __attribute__((__always_inline__, __nodebug__))
1167_mm_store_si128(__m128i *p, __m128i b)
1168{
1169  *p = b;
1170}
1171
1172static __inline__ void __attribute__((__always_inline__, __nodebug__))
1173_mm_storeu_si128(__m128i *p, __m128i b)
1174{
1175  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1176}
1177
1178static __inline__ void __attribute__((__always_inline__, __nodebug__))
1179_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1180{
1181  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1182}
1183
1184static __inline__ void __attribute__((__always_inline__, __nodebug__))
1185_mm_storel_epi64(__m128i *p, __m128i a)
1186{
1187  __builtin_ia32_storelv4si((__v2si *)p, a);
1188}
1189
1190static __inline__ void __attribute__((__always_inline__, __nodebug__))
1191_mm_stream_pd(double *p, __m128d a)
1192{
1193  __builtin_ia32_movntpd(p, a);
1194}
1195
1196static __inline__ void __attribute__((__always_inline__, __nodebug__))
1197_mm_stream_si128(__m128i *p, __m128i a)
1198{
1199  __builtin_ia32_movntdq(p, a);
1200}
1201
/* Stores the 32-bit integer a to p using the movnti (non-temporal store)
   builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *__dst = p;
  __builtin_ia32_movnti(__dst, a);
}
1207
/* Issues the clflush builtin for the address p. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *__addr = p;
  __builtin_ia32_clflush(__addr);
}
1213
/* Issues the lfence builtin (load fence). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
  return;
}
1219
/* Issues the mfence builtin (memory fence). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
  return;
}
1225
1226static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1227_mm_packs_epi16(__m128i a, __m128i b)
1228{
1229  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1230}
1231
1232static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1233_mm_packs_epi32(__m128i a, __m128i b)
1234{
1235  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1236}
1237
1238static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1239_mm_packus_epi16(__m128i a, __m128i b)
1240{
1241  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1242}
1243
1244static __inline__ int __attribute__((__always_inline__, __nodebug__))
1245_mm_extract_epi16(__m128i a, int imm)
1246{
1247  __v8hi b = (__v8hi)a;
1248  return (unsigned short)b[imm];
1249}
1250
1251static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1252_mm_insert_epi16(__m128i a, int b, int imm)
1253{
1254  __v8hi c = (__v8hi)a;
1255  c[imm & 7] = b;
1256  return (__m128i)c;
1257}
1258
1259static __inline__ int __attribute__((__always_inline__, __nodebug__))
1260_mm_movemask_epi8(__m128i a)
1261{
1262  return __builtin_ia32_pmovmskb128((__v16qi)a);
1263}
1264
/* Permutes the four 32-bit elements of a.  Each 2-bit field of imm, from
   the least significant upward, selects the source element for result
   elements 0..3.  imm must be an integer constant expression. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  __m128i __src = (a); \
  (__m128i)__builtin_shufflevector((__v4si)__src, \
                                   (__v4si) _mm_set1_epi32(0), \
                                   ((imm) >> 0) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3); })
1270
/* Permutes the four low 16-bit elements of a according to the 2-bit fields
   of imm; elements 4..7 are copied through unchanged.  imm must be an
   integer constant expression. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  __m128i __src = (a); \
  (__m128i)__builtin_shufflevector((__v8hi)__src, \
                                   (__v8hi) _mm_set1_epi16(0), \
                                   ((imm) >> 0) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })
1277
/* Permutes the four high 16-bit elements of a according to the 2-bit
   fields of imm (each field picks one of elements 4..7); elements 0..3 are
   copied through unchanged.  imm must be an integer constant expression. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  __m128i __src = (a); \
  (__m128i)__builtin_shufflevector((__v8hi)__src, \
                                   (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) >> 0) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })
1286
1287static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1288_mm_unpackhi_epi8(__m128i a, __m128i b)
1289{
1290  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1291}
1292
1293static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1294_mm_unpackhi_epi16(__m128i a, __m128i b)
1295{
1296  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1297}
1298
1299static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1300_mm_unpackhi_epi32(__m128i a, __m128i b)
1301{
1302  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1303}
1304
1305static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1306_mm_unpackhi_epi64(__m128i a, __m128i b)
1307{
1308  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1309}
1310
1311static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1312_mm_unpacklo_epi8(__m128i a, __m128i b)
1313{
1314  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1315}
1316
1317static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1318_mm_unpacklo_epi16(__m128i a, __m128i b)
1319{
1320  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1321}
1322
1323static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1324_mm_unpacklo_epi32(__m128i a, __m128i b)
1325{
1326  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1327}
1328
1329static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1330_mm_unpacklo_epi64(__m128i a, __m128i b)
1331{
1332  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1333}
1334
1335static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1336_mm_movepi64_pi64(__m128i a)
1337{
1338  return (__m64)a[0];
1339}
1340
1341static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1342_mm_movpi64_pi64(__m64 a)
1343{
1344  return (__m128i){ (long long)a, 0 };
1345}
1346
1347static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1348_mm_move_epi64(__m128i a)
1349{
1350  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1351}
1352
1353static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1354_mm_unpackhi_pd(__m128d a, __m128d b)
1355{
1356  return __builtin_shufflevector(a, b, 1, 2+1);
1357}
1358
1359static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1360_mm_unpacklo_pd(__m128d a, __m128d b)
1361{
1362  return __builtin_shufflevector(a, b, 0, 2+0);
1363}
1364
1365static __inline__ int __attribute__((__always_inline__, __nodebug__))
1366_mm_movemask_pd(__m128d a)
1367{
1368  return __builtin_ia32_movmskpd(a);
1369}
1370
/* Builds a two-double vector: bit 0 of i picks which element of a becomes
   result element 0, and bit 1 of i picks which element of b becomes result
   element 1.  i must be an integer constant expression. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __m128d __first = (a); \
  __m128d __second = (b); \
  __builtin_shufflevector(__first, __second, \
                          ((i) >> 0) & 1, \
                          2 + (((i) >> 1) & 1)); })
1375
1376static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1377_mm_castpd_ps(__m128d in)
1378{
1379  return (__m128)in;
1380}
1381
1382static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1383_mm_castpd_si128(__m128d in)
1384{
1385  return (__m128i)in;
1386}
1387
1388static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1389_mm_castps_pd(__m128 in)
1390{
1391  return (__m128d)in;
1392}
1393
1394static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1395_mm_castps_si128(__m128 in)
1396{
1397  return (__m128i)in;
1398}
1399
1400static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1401_mm_castsi128_ps(__m128i in)
1402{
1403  return (__m128)in;
1404}
1405
1406static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1407_mm_castsi128_pd(__m128i in)
1408{
1409  return (__m128d)in;
1410}
1411
/* Emits a single `pause` instruction via volatile inline assembly. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1417
/* Combines two 1-bit selectors into a 2-bit immediate: x occupies bit 1 and
   y occupies bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1419
1420#endif /* __SSE2__ */
1421
1422#endif /* __EMMINTRIN_H */
1423