1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public SSE2 vector types: 16-byte vectors whose elements are double and
   long long respectively. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal 16-byte vector types with explicit element types (double,
   long long, short, char); the intrinsics below cast to these to select the
   lane width of an operation. */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
41
42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43_mm_add_sd(__m128d __a, __m128d __b)
44{
45  __a[0] += __b[0];
46  return __a;
47}
48
49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50_mm_add_pd(__m128d __a, __m128d __b)
51{
52  return __a + __b;
53}
54
55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56_mm_sub_sd(__m128d __a, __m128d __b)
57{
58  __a[0] -= __b[0];
59  return __a;
60}
61
62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63_mm_sub_pd(__m128d __a, __m128d __b)
64{
65  return __a - __b;
66}
67
68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69_mm_mul_sd(__m128d __a, __m128d __b)
70{
71  __a[0] *= __b[0];
72  return __a;
73}
74
75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76_mm_mul_pd(__m128d __a, __m128d __b)
77{
78  return __a * __b;
79}
80
81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82_mm_div_sd(__m128d __a, __m128d __b)
83{
84  __a[0] /= __b[0];
85  return __a;
86}
87
88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89_mm_div_pd(__m128d __a, __m128d __b)
90{
91  return __a / __b;
92}
93
94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95_mm_sqrt_sd(__m128d __a, __m128d __b)
96{
97  __m128d __c = __builtin_ia32_sqrtsd(__b);
98  return (__m128d) { __c[0], __a[1] };
99}
100
101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102_mm_sqrt_pd(__m128d __a)
103{
104  return __builtin_ia32_sqrtpd(__a);
105}
106
107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108_mm_min_sd(__m128d __a, __m128d __b)
109{
110  return __builtin_ia32_minsd(__a, __b);
111}
112
113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114_mm_min_pd(__m128d __a, __m128d __b)
115{
116  return __builtin_ia32_minpd(__a, __b);
117}
118
119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120_mm_max_sd(__m128d __a, __m128d __b)
121{
122  return __builtin_ia32_maxsd(__a, __b);
123}
124
125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126_mm_max_pd(__m128d __a, __m128d __b)
127{
128  return __builtin_ia32_maxpd(__a, __b);
129}
130
131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132_mm_and_pd(__m128d __a, __m128d __b)
133{
134  return (__m128d)((__v4si)__a & (__v4si)__b);
135}
136
137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138_mm_andnot_pd(__m128d __a, __m128d __b)
139{
140  return (__m128d)(~(__v4si)__a & (__v4si)__b);
141}
142
143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144_mm_or_pd(__m128d __a, __m128d __b)
145{
146  return (__m128d)((__v4si)__a | (__v4si)__b);
147}
148
149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150_mm_xor_pd(__m128d __a, __m128d __b)
151{
152  return (__m128d)((__v4si)__a ^ (__v4si)__b);
153}
154
155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156_mm_cmpeq_pd(__m128d __a, __m128d __b)
157{
158  return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
159}
160
161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162_mm_cmplt_pd(__m128d __a, __m128d __b)
163{
164  return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
165}
166
167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168_mm_cmple_pd(__m128d __a, __m128d __b)
169{
170  return (__m128d)__builtin_ia32_cmplepd(__a, __b);
171}
172
173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174_mm_cmpgt_pd(__m128d __a, __m128d __b)
175{
176  return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
177}
178
179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180_mm_cmpge_pd(__m128d __a, __m128d __b)
181{
182  return (__m128d)__builtin_ia32_cmplepd(__b, __a);
183}
184
185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186_mm_cmpord_pd(__m128d __a, __m128d __b)
187{
188  return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
189}
190
191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192_mm_cmpunord_pd(__m128d __a, __m128d __b)
193{
194  return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
195}
196
197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198_mm_cmpneq_pd(__m128d __a, __m128d __b)
199{
200  return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
201}
202
203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204_mm_cmpnlt_pd(__m128d __a, __m128d __b)
205{
206  return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
207}
208
209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210_mm_cmpnle_pd(__m128d __a, __m128d __b)
211{
212  return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
213}
214
215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216_mm_cmpngt_pd(__m128d __a, __m128d __b)
217{
218  return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
219}
220
221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222_mm_cmpnge_pd(__m128d __a, __m128d __b)
223{
224  return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
225}
226
227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228_mm_cmpeq_sd(__m128d __a, __m128d __b)
229{
230  return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
231}
232
233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234_mm_cmplt_sd(__m128d __a, __m128d __b)
235{
236  return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
237}
238
239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240_mm_cmple_sd(__m128d __a, __m128d __b)
241{
242  return (__m128d)__builtin_ia32_cmplesd(__a, __b);
243}
244
245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246_mm_cmpgt_sd(__m128d __a, __m128d __b)
247{
248  __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
249  return (__m128d) { __c[0], __a[1] };
250}
251
252static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
253_mm_cmpge_sd(__m128d __a, __m128d __b)
254{
255  __m128d __c = __builtin_ia32_cmplesd(__b, __a);
256  return (__m128d) { __c[0], __a[1] };
257}
258
259static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
260_mm_cmpord_sd(__m128d __a, __m128d __b)
261{
262  return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
263}
264
265static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
266_mm_cmpunord_sd(__m128d __a, __m128d __b)
267{
268  return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
269}
270
271static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
272_mm_cmpneq_sd(__m128d __a, __m128d __b)
273{
274  return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
275}
276
277static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
278_mm_cmpnlt_sd(__m128d __a, __m128d __b)
279{
280  return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
281}
282
283static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
284_mm_cmpnle_sd(__m128d __a, __m128d __b)
285{
286  return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
287}
288
289static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
290_mm_cmpngt_sd(__m128d __a, __m128d __b)
291{
292  __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
293  return (__m128d) { __c[0], __a[1] };
294}
295
296static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
297_mm_cmpnge_sd(__m128d __a, __m128d __b)
298{
299  __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
300  return (__m128d) { __c[0], __a[1] };
301}
302
303static __inline__ int __attribute__((__always_inline__, __nodebug__))
304_mm_comieq_sd(__m128d __a, __m128d __b)
305{
306  return __builtin_ia32_comisdeq(__a, __b);
307}
308
309static __inline__ int __attribute__((__always_inline__, __nodebug__))
310_mm_comilt_sd(__m128d __a, __m128d __b)
311{
312  return __builtin_ia32_comisdlt(__a, __b);
313}
314
315static __inline__ int __attribute__((__always_inline__, __nodebug__))
316_mm_comile_sd(__m128d __a, __m128d __b)
317{
318  return __builtin_ia32_comisdle(__a, __b);
319}
320
321static __inline__ int __attribute__((__always_inline__, __nodebug__))
322_mm_comigt_sd(__m128d __a, __m128d __b)
323{
324  return __builtin_ia32_comisdgt(__a, __b);
325}
326
327static __inline__ int __attribute__((__always_inline__, __nodebug__))
328_mm_comige_sd(__m128d __a, __m128d __b)
329{
330  return __builtin_ia32_comisdge(__a, __b);
331}
332
333static __inline__ int __attribute__((__always_inline__, __nodebug__))
334_mm_comineq_sd(__m128d __a, __m128d __b)
335{
336  return __builtin_ia32_comisdneq(__a, __b);
337}
338
339static __inline__ int __attribute__((__always_inline__, __nodebug__))
340_mm_ucomieq_sd(__m128d __a, __m128d __b)
341{
342  return __builtin_ia32_ucomisdeq(__a, __b);
343}
344
345static __inline__ int __attribute__((__always_inline__, __nodebug__))
346_mm_ucomilt_sd(__m128d __a, __m128d __b)
347{
348  return __builtin_ia32_ucomisdlt(__a, __b);
349}
350
351static __inline__ int __attribute__((__always_inline__, __nodebug__))
352_mm_ucomile_sd(__m128d __a, __m128d __b)
353{
354  return __builtin_ia32_ucomisdle(__a, __b);
355}
356
357static __inline__ int __attribute__((__always_inline__, __nodebug__))
358_mm_ucomigt_sd(__m128d __a, __m128d __b)
359{
360  return __builtin_ia32_ucomisdgt(__a, __b);
361}
362
363static __inline__ int __attribute__((__always_inline__, __nodebug__))
364_mm_ucomige_sd(__m128d __a, __m128d __b)
365{
366  return __builtin_ia32_ucomisdge(__a, __b);
367}
368
369static __inline__ int __attribute__((__always_inline__, __nodebug__))
370_mm_ucomineq_sd(__m128d __a, __m128d __b)
371{
372  return __builtin_ia32_ucomisdneq(__a, __b);
373}
374
375static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
376_mm_cvtpd_ps(__m128d __a)
377{
378  return __builtin_ia32_cvtpd2ps(__a);
379}
380
381static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
382_mm_cvtps_pd(__m128 __a)
383{
384  return __builtin_ia32_cvtps2pd(__a);
385}
386
387static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
388_mm_cvtepi32_pd(__m128i __a)
389{
390  return __builtin_ia32_cvtdq2pd((__v4si)__a);
391}
392
393static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
394_mm_cvtpd_epi32(__m128d __a)
395{
396  return __builtin_ia32_cvtpd2dq(__a);
397}
398
399static __inline__ int __attribute__((__always_inline__, __nodebug__))
400_mm_cvtsd_si32(__m128d __a)
401{
402  return __builtin_ia32_cvtsd2si(__a);
403}
404
405static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
406_mm_cvtsd_ss(__m128 __a, __m128d __b)
407{
408  __a[0] = __b[0];
409  return __a;
410}
411
412static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
413_mm_cvtsi32_sd(__m128d __a, int __b)
414{
415  __a[0] = __b;
416  return __a;
417}
418
419static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
420_mm_cvtss_sd(__m128d __a, __m128 __b)
421{
422  __a[0] = __b[0];
423  return __a;
424}
425
426static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
427_mm_cvttpd_epi32(__m128d __a)
428{
429  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
430}
431
432static __inline__ int __attribute__((__always_inline__, __nodebug__))
433_mm_cvttsd_si32(__m128d __a)
434{
435  return __a[0];
436}
437
438static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
439_mm_cvtpd_pi32(__m128d __a)
440{
441  return (__m64)__builtin_ia32_cvtpd2pi(__a);
442}
443
444static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
445_mm_cvttpd_pi32(__m128d __a)
446{
447  return (__m64)__builtin_ia32_cvttpd2pi(__a);
448}
449
450static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
451_mm_cvtpi32_pd(__m64 __a)
452{
453  return __builtin_ia32_cvtpi2pd((__v2si)__a);
454}
455
456static __inline__ double __attribute__((__always_inline__, __nodebug__))
457_mm_cvtsd_f64(__m128d __a)
458{
459  return __a[0];
460}
461
462static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
463_mm_load_pd(double const *__dp)
464{
465  return *(__m128d*)__dp;
466}
467
468static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
469_mm_load1_pd(double const *__dp)
470{
471  struct __mm_load1_pd_struct {
472    double __u;
473  } __attribute__((__packed__, __may_alias__));
474  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
475  return (__m128d){ __u, __u };
476}
477
/* Alternate spelling of _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
479
480static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
481_mm_loadr_pd(double const *__dp)
482{
483  __m128d __u = *(__m128d*)__dp;
484  return __builtin_shufflevector(__u, __u, 1, 0);
485}
486
487static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
488_mm_loadu_pd(double const *__dp)
489{
490  struct __loadu_pd {
491    __m128d __v;
492  } __attribute__((__packed__, __may_alias__));
493  return ((struct __loadu_pd*)__dp)->__v;
494}
495
496static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
497_mm_load_sd(double const *__dp)
498{
499  struct __mm_load_sd_struct {
500    double __u;
501  } __attribute__((__packed__, __may_alias__));
502  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
503  return (__m128d){ __u, 0 };
504}
505
506static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
507_mm_loadh_pd(__m128d __a, double const *__dp)
508{
509  struct __mm_loadh_pd_struct {
510    double __u;
511  } __attribute__((__packed__, __may_alias__));
512  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
513  return (__m128d){ __a[0], __u };
514}
515
516static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
517_mm_loadl_pd(__m128d __a, double const *__dp)
518{
519  struct __mm_loadl_pd_struct {
520    double __u;
521  } __attribute__((__packed__, __may_alias__));
522  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
523  return (__m128d){ __u, __a[1] };
524}
525
526static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
527_mm_set_sd(double __w)
528{
529  return (__m128d){ __w, 0 };
530}
531
532static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
533_mm_set1_pd(double __w)
534{
535  return (__m128d){ __w, __w };
536}
537
538static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
539_mm_set_pd(double __w, double __x)
540{
541  return (__m128d){ __x, __w };
542}
543
544static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
545_mm_setr_pd(double __w, double __x)
546{
547  return (__m128d){ __w, __x };
548}
549
550static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
551_mm_setzero_pd(void)
552{
553  return (__m128d){ 0, 0 };
554}
555
556static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
557_mm_move_sd(__m128d __a, __m128d __b)
558{
559  return (__m128d){ __b[0], __a[1] };
560}
561
562static __inline__ void __attribute__((__always_inline__, __nodebug__))
563_mm_store_sd(double *__dp, __m128d __a)
564{
565  struct __mm_store_sd_struct {
566    double __u;
567  } __attribute__((__packed__, __may_alias__));
568  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
569}
570
571static __inline__ void __attribute__((__always_inline__, __nodebug__))
572_mm_store1_pd(double *__dp, __m128d __a)
573{
574  struct __mm_store1_pd_struct {
575    double __u[2];
576  } __attribute__((__packed__, __may_alias__));
577  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
578  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
579}
580
581static __inline__ void __attribute__((__always_inline__, __nodebug__))
582_mm_store_pd(double *__dp, __m128d __a)
583{
584  *(__m128d *)__dp = __a;
585}
586
587static __inline__ void __attribute__((__always_inline__, __nodebug__))
588_mm_storeu_pd(double *__dp, __m128d __a)
589{
590  __builtin_ia32_storeupd(__dp, __a);
591}
592
593static __inline__ void __attribute__((__always_inline__, __nodebug__))
594_mm_storer_pd(double *__dp, __m128d __a)
595{
596  __a = __builtin_shufflevector(__a, __a, 1, 0);
597  *(__m128d *)__dp = __a;
598}
599
600static __inline__ void __attribute__((__always_inline__, __nodebug__))
601_mm_storeh_pd(double *__dp, __m128d __a)
602{
603  struct __mm_storeh_pd_struct {
604    double __u;
605  } __attribute__((__packed__, __may_alias__));
606  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
607}
608
609static __inline__ void __attribute__((__always_inline__, __nodebug__))
610_mm_storel_pd(double *__dp, __m128d __a)
611{
612  struct __mm_storeh_pd_struct {
613    double __u;
614  } __attribute__((__packed__, __may_alias__));
615  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
616}
617
618static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
619_mm_add_epi8(__m128i __a, __m128i __b)
620{
621  return (__m128i)((__v16qi)__a + (__v16qi)__b);
622}
623
624static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
625_mm_add_epi16(__m128i __a, __m128i __b)
626{
627  return (__m128i)((__v8hi)__a + (__v8hi)__b);
628}
629
630static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
631_mm_add_epi32(__m128i __a, __m128i __b)
632{
633  return (__m128i)((__v4si)__a + (__v4si)__b);
634}
635
636static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
637_mm_add_si64(__m64 __a, __m64 __b)
638{
639  return __a + __b;
640}
641
642static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
643_mm_add_epi64(__m128i __a, __m128i __b)
644{
645  return __a + __b;
646}
647
648static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
649_mm_adds_epi8(__m128i __a, __m128i __b)
650{
651  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
652}
653
654static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
655_mm_adds_epi16(__m128i __a, __m128i __b)
656{
657  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
658}
659
660static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
661_mm_adds_epu8(__m128i __a, __m128i __b)
662{
663  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
664}
665
666static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
667_mm_adds_epu16(__m128i __a, __m128i __b)
668{
669  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
670}
671
672static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
673_mm_avg_epu8(__m128i __a, __m128i __b)
674{
675  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
676}
677
678static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
679_mm_avg_epu16(__m128i __a, __m128i __b)
680{
681  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
682}
683
684static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
685_mm_madd_epi16(__m128i __a, __m128i __b)
686{
687  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
688}
689
690static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
691_mm_max_epi16(__m128i __a, __m128i __b)
692{
693  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
694}
695
696static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
697_mm_max_epu8(__m128i __a, __m128i __b)
698{
699  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
700}
701
702static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
703_mm_min_epi16(__m128i __a, __m128i __b)
704{
705  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
706}
707
708static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
709_mm_min_epu8(__m128i __a, __m128i __b)
710{
711  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
712}
713
714static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
715_mm_mulhi_epi16(__m128i __a, __m128i __b)
716{
717  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
718}
719
720static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
721_mm_mulhi_epu16(__m128i __a, __m128i __b)
722{
723  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
724}
725
726static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
727_mm_mullo_epi16(__m128i __a, __m128i __b)
728{
729  return (__m128i)((__v8hi)__a * (__v8hi)__b);
730}
731
732static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
733_mm_mul_su32(__m64 __a, __m64 __b)
734{
735  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
736}
737
738static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
739_mm_mul_epu32(__m128i __a, __m128i __b)
740{
741  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
742}
743
744static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
745_mm_sad_epu8(__m128i __a, __m128i __b)
746{
747  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
748}
749
750static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
751_mm_sub_epi8(__m128i __a, __m128i __b)
752{
753  return (__m128i)((__v16qi)__a - (__v16qi)__b);
754}
755
756static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
757_mm_sub_epi16(__m128i __a, __m128i __b)
758{
759  return (__m128i)((__v8hi)__a - (__v8hi)__b);
760}
761
762static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
763_mm_sub_epi32(__m128i __a, __m128i __b)
764{
765  return (__m128i)((__v4si)__a - (__v4si)__b);
766}
767
768static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
769_mm_sub_si64(__m64 __a, __m64 __b)
770{
771  return __a - __b;
772}
773
774static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
775_mm_sub_epi64(__m128i __a, __m128i __b)
776{
777  return __a - __b;
778}
779
780static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
781_mm_subs_epi8(__m128i __a, __m128i __b)
782{
783  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
784}
785
786static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
787_mm_subs_epi16(__m128i __a, __m128i __b)
788{
789  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
790}
791
792static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
793_mm_subs_epu8(__m128i __a, __m128i __b)
794{
795  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
796}
797
798static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
799_mm_subs_epu16(__m128i __a, __m128i __b)
800{
801  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
802}
803
804static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
805_mm_and_si128(__m128i __a, __m128i __b)
806{
807  return __a & __b;
808}
809
810static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
811_mm_andnot_si128(__m128i __a, __m128i __b)
812{
813  return ~__a & __b;
814}
815
816static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
817_mm_or_si128(__m128i __a, __m128i __b)
818{
819  return __a | __b;
820}
821
822static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
823_mm_xor_si128(__m128i __a, __m128i __b)
824{
825  return __a ^ __b;
826}
827
/* Byte-wise left shift of `a` by `imm` bytes, built as a 16-byte shuffle.
   The shuffle's first input is a zero vector (indices 0-15) and its second
   input is `a` (indices 16-31). With s = imm & 0xF, result byte i uses index
   16 + i - s: that selects byte i - s of `a` when i >= s and a zero byte
   otherwise. If any of bits 4-7 of imm is set, every index is 0, so the
   result is all zeros.
   `imm` is expanded many times; presumably it must be a compile-time
   constant because it feeds shuffle indices -- confirm against the
   __builtin_shufflevector documentation. */
#define _mm_slli_si128(a, imm) __extension__ ({                         \
  (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(),        \
                                   (__v16qi)(__m128i)(a),               \
                                   ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })
847
/* Alternate spelling of _mm_slli_si128 with identical arguments. */
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
850
851static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
852_mm_slli_epi16(__m128i __a, int __count)
853{
854  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
855}
856
857static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
858_mm_sll_epi16(__m128i __a, __m128i __count)
859{
860  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
861}
862
863static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
864_mm_slli_epi32(__m128i __a, int __count)
865{
866  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
867}
868
869static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
870_mm_sll_epi32(__m128i __a, __m128i __count)
871{
872  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
873}
874
875static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
876_mm_slli_epi64(__m128i __a, int __count)
877{
878  return __builtin_ia32_psllqi128(__a, __count);
879}
880
881static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
882_mm_sll_epi64(__m128i __a, __m128i __count)
883{
884  return __builtin_ia32_psllq128(__a, __count);
885}
886
887static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
888_mm_srai_epi16(__m128i __a, int __count)
889{
890  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
891}
892
893static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
894_mm_sra_epi16(__m128i __a, __m128i __count)
895{
896  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
897}
898
899static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
900_mm_srai_epi32(__m128i __a, int __count)
901{
902  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
903}
904
905static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
906_mm_sra_epi32(__m128i __a, __m128i __count)
907{
908  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
909}
910
/* Byte-wise right shift of `a` by `imm` bytes, built as a 16-byte shuffle.
   The shuffle's first input is `a` (indices 0-15) and its second input is a
   zero vector (indices 16-31). With s = imm & 0xF, result byte i uses index
   s + i: that selects byte s + i of `a` while s + i <= 15 and a zero byte
   otherwise. If any of bits 4-7 of imm is set, every index is 16 (a zero
   byte), so the result is all zeros.
   `imm` is expanded many times; presumably it must be a compile-time
   constant because it feeds shuffle indices -- confirm against the
   __builtin_shufflevector documentation. */
#define _mm_srli_si128(a, imm) __extension__ ({                          \
  (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a),                \
                                   (__v16qi)_mm_setzero_si128(),         \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0,  \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1,  \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2,  \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3,  \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4,  \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5,  \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6,  \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7,  \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8,  \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9,  \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); })
930
/* Alternate spelling of _mm_srli_si128 with identical arguments. */
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
933
934static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
935_mm_srli_epi16(__m128i __a, int __count)
936{
937  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
938}
939
940static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
941_mm_srl_epi16(__m128i __a, __m128i __count)
942{
943  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
944}
945
946static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
947_mm_srli_epi32(__m128i __a, int __count)
948{
949  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
950}
951
952static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
953_mm_srl_epi32(__m128i __a, __m128i __count)
954{
955  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
956}
957
958static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
959_mm_srli_epi64(__m128i __a, int __count)
960{
961  return __builtin_ia32_psrlqi128(__a, __count);
962}
963
964static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
965_mm_srl_epi64(__m128i __a, __m128i __count)
966{
967  return __builtin_ia32_psrlq128(__a, __count);
968}
969
970static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
971_mm_cmpeq_epi8(__m128i __a, __m128i __b)
972{
973  return (__m128i)((__v16qi)__a == (__v16qi)__b);
974}
975
976static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
977_mm_cmpeq_epi16(__m128i __a, __m128i __b)
978{
979  return (__m128i)((__v8hi)__a == (__v8hi)__b);
980}
981
982static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
983_mm_cmpeq_epi32(__m128i __a, __m128i __b)
984{
985  return (__m128i)((__v4si)__a == (__v4si)__b);
986}
987
988static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
989_mm_cmpgt_epi8(__m128i __a, __m128i __b)
990{
991  /* This function always performs a signed comparison, but __v16qi is a char
992     which may be signed or unsigned. */
993  typedef signed char __v16qs __attribute__((__vector_size__(16)));
994  return (__m128i)((__v16qs)__a > (__v16qs)__b);
995}
996
997static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
998_mm_cmpgt_epi16(__m128i __a, __m128i __b)
999{
1000  return (__m128i)((__v8hi)__a > (__v8hi)__b);
1001}
1002
1003static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1004_mm_cmpgt_epi32(__m128i __a, __m128i __b)
1005{
1006  return (__m128i)((__v4si)__a > (__v4si)__b);
1007}
1008
1009static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1010_mm_cmplt_epi8(__m128i __a, __m128i __b)
1011{
1012  return _mm_cmpgt_epi8(__b, __a);
1013}
1014
1015static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1016_mm_cmplt_epi16(__m128i __a, __m128i __b)
1017{
1018  return _mm_cmpgt_epi16(__b, __a);
1019}
1020
1021static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1022_mm_cmplt_epi32(__m128i __a, __m128i __b)
1023{
1024  return _mm_cmpgt_epi32(__b, __a);
1025}
1026
1027#ifdef __x86_64__
1028static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1029_mm_cvtsi64_sd(__m128d __a, long long __b)
1030{
1031  __a[0] = __b;
1032  return __a;
1033}
1034
1035static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1036_mm_cvtsd_si64(__m128d __a)
1037{
1038  return __builtin_ia32_cvtsd2si64(__a);
1039}
1040
1041static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1042_mm_cvttsd_si64(__m128d __a)
1043{
1044  return __a[0];
1045}
1046#endif
1047
1048static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1049_mm_cvtepi32_ps(__m128i __a)
1050{
1051  return __builtin_ia32_cvtdq2ps((__v4si)__a);
1052}
1053
1054static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1055_mm_cvtps_epi32(__m128 __a)
1056{
1057  return (__m128i)__builtin_ia32_cvtps2dq(__a);
1058}
1059
1060static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1061_mm_cvttps_epi32(__m128 __a)
1062{
1063  return (__m128i)__builtin_ia32_cvttps2dq(__a);
1064}
1065
1066static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1067_mm_cvtsi32_si128(int __a)
1068{
1069  return (__m128i)(__v4si){ __a, 0, 0, 0 };
1070}
1071
1072#ifdef __x86_64__
1073static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1074_mm_cvtsi64_si128(long long __a)
1075{
1076  return (__m128i){ __a, 0 };
1077}
1078#endif
1079
1080static __inline__ int __attribute__((__always_inline__, __nodebug__))
1081_mm_cvtsi128_si32(__m128i __a)
1082{
1083  __v4si __b = (__v4si)__a;
1084  return __b[0];
1085}
1086
1087#ifdef __x86_64__
1088static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1089_mm_cvtsi128_si64(__m128i __a)
1090{
1091  return __a[0];
1092}
1093#endif
1094
1095static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1096_mm_load_si128(__m128i const *__p)
1097{
1098  return *__p;
1099}
1100
1101static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1102_mm_loadu_si128(__m128i const *__p)
1103{
1104  struct __loadu_si128 {
1105    __m128i __v;
1106  } __attribute__((__packed__, __may_alias__));
1107  return ((struct __loadu_si128*)__p)->__v;
1108}
1109
1110static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1111_mm_loadl_epi64(__m128i const *__p)
1112{
1113  struct __mm_loadl_epi64_struct {
1114    long long __u;
1115  } __attribute__((__packed__, __may_alias__));
1116  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1117}
1118
1119static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1120_mm_set_epi64x(long long q1, long long q0)
1121{
1122  return (__m128i){ q0, q1 };
1123}
1124
1125static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1126_mm_set_epi64(__m64 q1, __m64 q0)
1127{
1128  return (__m128i){ (long long)q0, (long long)q1 };
1129}
1130
1131static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1132_mm_set_epi32(int i3, int i2, int i1, int i0)
1133{
1134  return (__m128i)(__v4si){ i0, i1, i2, i3};
1135}
1136
1137static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1138_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1139{
1140  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1141}
1142
1143static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1144_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1145{
1146  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1147}
1148
1149static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1150_mm_set1_epi64x(long long __q)
1151{
1152  return (__m128i){ __q, __q };
1153}
1154
1155static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1156_mm_set1_epi64(__m64 __q)
1157{
1158  return (__m128i){ (long long)__q, (long long)__q };
1159}
1160
1161static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1162_mm_set1_epi32(int __i)
1163{
1164  return (__m128i)(__v4si){ __i, __i, __i, __i };
1165}
1166
1167static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1168_mm_set1_epi16(short __w)
1169{
1170  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
1171}
1172
1173static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1174_mm_set1_epi8(char __b)
1175{
1176  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
1177}
1178
1179static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1180_mm_setr_epi64(__m64 q0, __m64 q1)
1181{
1182  return (__m128i){ (long long)q0, (long long)q1 };
1183}
1184
1185static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1186_mm_setr_epi32(int i0, int i1, int i2, int i3)
1187{
1188  return (__m128i)(__v4si){ i0, i1, i2, i3};
1189}
1190
1191static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1192_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1193{
1194  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1195}
1196
1197static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1198_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1199{
1200  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1201}
1202
1203static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1204_mm_setzero_si128(void)
1205{
1206  return (__m128i){ 0LL, 0LL };
1207}
1208
1209static __inline__ void __attribute__((__always_inline__, __nodebug__))
1210_mm_store_si128(__m128i *__p, __m128i __b)
1211{
1212  *__p = __b;
1213}
1214
1215static __inline__ void __attribute__((__always_inline__, __nodebug__))
1216_mm_storeu_si128(__m128i *__p, __m128i __b)
1217{
1218  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
1219}
1220
1221static __inline__ void __attribute__((__always_inline__, __nodebug__))
1222_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
1223{
1224  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
1225}
1226
1227static __inline__ void __attribute__((__always_inline__, __nodebug__))
1228_mm_storel_epi64(__m128i *__p, __m128i __a)
1229{
1230  struct __mm_storel_epi64_struct {
1231    long long __u;
1232  } __attribute__((__packed__, __may_alias__));
1233  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
1234}
1235
1236static __inline__ void __attribute__((__always_inline__, __nodebug__))
1237_mm_stream_pd(double *__p, __m128d __a)
1238{
1239  __builtin_ia32_movntpd(__p, __a);
1240}
1241
1242static __inline__ void __attribute__((__always_inline__, __nodebug__))
1243_mm_stream_si128(__m128i *__p, __m128i __a)
1244{
1245  __builtin_ia32_movntdq(__p, __a);
1246}
1247
/* Stores the 32-bit integer __a at __p through the movnti builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  int *__dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
1253
1254#ifdef __x86_64__
/* Stores the 64-bit integer __a at __p through the movnti64 builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  long long *__dst = __p;
  __builtin_ia32_movnti64(__dst, __a);
}
1260#endif
1261
/* Passes the address __p to the clflush builtin; the pointed-to data is not
   accessed by this wrapper itself. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  void const *__addr = __p;
  __builtin_ia32_clflush(__addr);
}
1267
/* Issues the lfence builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  (void)0;  /* no operands to prepare */
  __builtin_ia32_lfence();
}
1273
/* Issues the mfence builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  (void)0;  /* no operands to prepare */
  __builtin_ia32_mfence();
}
1279
1280static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1281_mm_packs_epi16(__m128i __a, __m128i __b)
1282{
1283  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
1284}
1285
1286static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1287_mm_packs_epi32(__m128i __a, __m128i __b)
1288{
1289  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
1290}
1291
1292static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1293_mm_packus_epi16(__m128i __a, __m128i __b)
1294{
1295  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
1296}
1297
1298static __inline__ int __attribute__((__always_inline__, __nodebug__))
1299_mm_extract_epi16(__m128i __a, int __imm)
1300{
1301  __v8hi __b = (__v8hi)__a;
1302  return (unsigned short)__b[__imm & 7];
1303}
1304
1305static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1306_mm_insert_epi16(__m128i __a, int __b, int __imm)
1307{
1308  __v8hi __c = (__v8hi)__a;
1309  __c[__imm & 7] = __b;
1310  return (__m128i)__c;
1311}
1312
1313static __inline__ int __attribute__((__always_inline__, __nodebug__))
1314_mm_movemask_epi8(__m128i __a)
1315{
1316  return __builtin_ia32_pmovmskb128((__v16qi)__a);
1317}
1318
/* Rearranges the four 32-bit elements of a.  Each 2-bit field of imm, lowest
   field first, is the index of the source element for result elements 0..3.
   All indices are in 0..3, so the zero vector passed as the second shuffle
   operand is never selected. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si)_mm_set1_epi32(0), \
                                   (((imm) & 0x03) >> 0), \
                                   (((imm) & 0x0c) >> 2), \
                                   (((imm) & 0x30) >> 4), \
                                   (((imm) & 0xc0) >> 6)); })
1324
/* Rearranges the low four 16-bit elements of a and passes elements 4..7
   through unchanged.  Each 2-bit field of imm, lowest field first, is the
   index (0..3) of the source element for result elements 0..3.  The zero
   vector passed as the second shuffle operand is never selected. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_set1_epi16(0), \
                                   (((imm) & 0x03) >> 0), \
                                   (((imm) & 0x0c) >> 2), \
                                   (((imm) & 0x30) >> 4), \
                                   (((imm) & 0xc0) >> 6), \
                                   4, 5, 6, 7); })
1331
/* Rearranges the high four 16-bit elements of a and passes elements 0..3
   through unchanged.  Each 2-bit field of imm, lowest field first, plus 4 is
   the index (4..7) of the source element for result elements 4..7.  The zero
   vector passed as the second shuffle operand is never selected. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   (((imm) & 0x03) >> 0) + 4, \
                                   (((imm) & 0x0c) >> 2) + 4, \
                                   (((imm) & 0x30) >> 4) + 4, \
                                   (((imm) & 0xc0) >> 6) + 4); })
1340
1341static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1342_mm_unpackhi_epi8(__m128i __a, __m128i __b)
1343{
1344  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1345}
1346
1347static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1348_mm_unpackhi_epi16(__m128i __a, __m128i __b)
1349{
1350  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1351}
1352
1353static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1354_mm_unpackhi_epi32(__m128i __a, __m128i __b)
1355{
1356  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
1357}
1358
1359static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1360_mm_unpackhi_epi64(__m128i __a, __m128i __b)
1361{
1362  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
1363}
1364
1365static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1366_mm_unpacklo_epi8(__m128i __a, __m128i __b)
1367{
1368  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1369}
1370
1371static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1372_mm_unpacklo_epi16(__m128i __a, __m128i __b)
1373{
1374  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1375}
1376
1377static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1378_mm_unpacklo_epi32(__m128i __a, __m128i __b)
1379{
1380  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
1381}
1382
1383static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1384_mm_unpacklo_epi64(__m128i __a, __m128i __b)
1385{
1386  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
1387}
1388
1389static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1390_mm_movepi64_pi64(__m128i __a)
1391{
1392  return (__m64)__a[0];
1393}
1394
1395static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1396_mm_movpi64_epi64(__m64 __a)
1397{
1398  return (__m128i){ (long long)__a, 0 };
1399}
1400
1401static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1402_mm_move_epi64(__m128i __a)
1403{
1404  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
1405}
1406
1407static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1408_mm_unpackhi_pd(__m128d __a, __m128d __b)
1409{
1410  return __builtin_shufflevector(__a, __b, 1, 2+1);
1411}
1412
1413static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1414_mm_unpacklo_pd(__m128d __a, __m128d __b)
1415{
1416  return __builtin_shufflevector(__a, __b, 0, 2+0);
1417}
1418
1419static __inline__ int __attribute__((__always_inline__, __nodebug__))
1420_mm_movemask_pd(__m128d __a)
1421{
1422  return __builtin_ia32_movmskpd(__a);
1423}
1424
/* Builds a two-double vector: bit 0 of i picks the element of a used for
   result element 0, bit 1 of i picks the element of b used for result
   element 1 (shuffle indices 2..3 name elements of b). */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                          ((i) & 1), \
                          ((((i) & 2) >> 1) + 2)); })
1428
1429static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1430_mm_castpd_ps(__m128d __a)
1431{
1432  return (__m128)__a;
1433}
1434
1435static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1436_mm_castpd_si128(__m128d __a)
1437{
1438  return (__m128i)__a;
1439}
1440
1441static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1442_mm_castps_pd(__m128 __a)
1443{
1444  return (__m128d)__a;
1445}
1446
1447static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1448_mm_castps_si128(__m128 __a)
1449{
1450  return (__m128i)__a;
1451}
1452
1453static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1454_mm_castsi128_ps(__m128i __a)
1455{
1456  return (__m128)__a;
1457}
1458
1459static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1460_mm_castsi128_pd(__m128i __a)
1461{
1462  return (__m128d)__a;
1463}
1464
/* Emits a single "pause" instruction; the asm is volatile so the compiler
   does not remove it. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1470
/* Packs two selectors into one value: y in bit 0, x shifted into bit 1. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1472
1473#endif /* __SSE2__ */
1474
1475#endif /* __EMMINTRIN_H */
1476