emmintrin.h revision ae8ecdd6dbaac2fc3e10f3146ec6bae28428cea3
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public 16-byte vector types: __m128d has double lanes, __m128i has
   long long lanes. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal 16-byte vector views with int, short and char lanes; the integer
   intrinsics in this header cast __m128i operands to these. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
39
40static inline __m128d __attribute__((__always_inline__, __nodebug__))
41_mm_add_sd(__m128d a, __m128d b)
42{
43  return __builtin_ia32_addsd(a, b);
44}
45
46static inline __m128d __attribute__((__always_inline__, __nodebug__))
47_mm_add_pd(__m128d a, __m128d b)
48{
49  return a + b;
50}
51
52static inline __m128d __attribute__((__always_inline__, __nodebug__))
53_mm_sub_sd(__m128d a, __m128d b)
54{
55  return __builtin_ia32_subsd(a, b);
56}
57
58static inline __m128d __attribute__((__always_inline__, __nodebug__))
59_mm_sub_pd(__m128d a, __m128d b)
60{
61  return a - b;
62}
63
64static inline __m128d __attribute__((__always_inline__, __nodebug__))
65_mm_mul_sd(__m128d a, __m128d b)
66{
67  return __builtin_ia32_mulsd(a, b);
68}
69
70static inline __m128d __attribute__((__always_inline__, __nodebug__))
71_mm_mul_pd(__m128d a, __m128d b)
72{
73  return a * b;
74}
75
76static inline __m128d __attribute__((__always_inline__, __nodebug__))
77_mm_div_sd(__m128d a, __m128d b)
78{
79  return __builtin_ia32_divsd(a, b);
80}
81
82static inline __m128d __attribute__((__always_inline__, __nodebug__))
83_mm_div_pd(__m128d a, __m128d b)
84{
85  return a / b;
86}
87
88static inline __m128d __attribute__((__always_inline__, __nodebug__))
89_mm_sqrt_sd(__m128d a, __m128d b)
90{
91  __m128d c = __builtin_ia32_sqrtsd(b);
92  return (__m128d) { c[0], a[1] };
93}
94
95static inline __m128d __attribute__((__always_inline__, __nodebug__))
96_mm_sqrt_pd(__m128d a)
97{
98  return __builtin_ia32_sqrtpd(a);
99}
100
101static inline __m128d __attribute__((__always_inline__, __nodebug__))
102_mm_min_sd(__m128d a, __m128d b)
103{
104  return __builtin_ia32_minsd(a, b);
105}
106
107static inline __m128d __attribute__((__always_inline__, __nodebug__))
108_mm_min_pd(__m128d a, __m128d b)
109{
110  return __builtin_ia32_minpd(a, b);
111}
112
113static inline __m128d __attribute__((__always_inline__, __nodebug__))
114_mm_max_sd(__m128d a, __m128d b)
115{
116  return __builtin_ia32_maxsd(a, b);
117}
118
119static inline __m128d __attribute__((__always_inline__, __nodebug__))
120_mm_max_pd(__m128d a, __m128d b)
121{
122  return __builtin_ia32_maxpd(a, b);
123}
124
125static inline __m128d __attribute__((__always_inline__, __nodebug__))
126_mm_and_pd(__m128d a, __m128d b)
127{
128  return __builtin_ia32_andpd(a, b);
129}
130
131static inline __m128d __attribute__((__always_inline__, __nodebug__))
132_mm_andnot_pd(__m128d a, __m128d b)
133{
134  return __builtin_ia32_andnpd(a, b);
135}
136
137static inline __m128d __attribute__((__always_inline__, __nodebug__))
138_mm_or_pd(__m128d a, __m128d b)
139{
140  return __builtin_ia32_orpd(a, b);
141}
142
143static inline __m128d __attribute__((__always_inline__, __nodebug__))
144_mm_xor_pd(__m128d a, __m128d b)
145{
146  return __builtin_ia32_xorpd(a, b);
147}
148
149static inline __m128d __attribute__((__always_inline__, __nodebug__))
150_mm_cmpeq_pd(__m128d a, __m128d b)
151{
152  return (__m128d)__builtin_ia32_cmpeqpd(a, b);
153}
154
155static inline __m128d __attribute__((__always_inline__, __nodebug__))
156_mm_cmplt_pd(__m128d a, __m128d b)
157{
158  return (__m128d)__builtin_ia32_cmpltpd(a, b);
159}
160
161static inline __m128d __attribute__((__always_inline__, __nodebug__))
162_mm_cmple_pd(__m128d a, __m128d b)
163{
164  return (__m128d)__builtin_ia32_cmplepd(a, b);
165}
166
167static inline __m128d __attribute__((__always_inline__, __nodebug__))
168_mm_cmpgt_pd(__m128d a, __m128d b)
169{
170  return (__m128d)__builtin_ia32_cmpltpd(b, a);
171}
172
173static inline __m128d __attribute__((__always_inline__, __nodebug__))
174_mm_cmpge_pd(__m128d a, __m128d b)
175{
176  return (__m128d)__builtin_ia32_cmplepd(b, a);
177}
178
179static inline __m128d __attribute__((__always_inline__, __nodebug__))
180_mm_cmpord_pd(__m128d a, __m128d b)
181{
182  return (__m128d)__builtin_ia32_cmpordpd(a, b);
183}
184
185static inline __m128d __attribute__((__always_inline__, __nodebug__))
186_mm_cmpunord_pd(__m128d a, __m128d b)
187{
188  return (__m128d)__builtin_ia32_cmpunordpd(a, b);
189}
190
191static inline __m128d __attribute__((__always_inline__, __nodebug__))
192_mm_cmpneq_pd(__m128d a, __m128d b)
193{
194  return (__m128d)__builtin_ia32_cmpneqpd(a, b);
195}
196
197static inline __m128d __attribute__((__always_inline__, __nodebug__))
198_mm_cmpnlt_pd(__m128d a, __m128d b)
199{
200  return (__m128d)__builtin_ia32_cmpnltpd(a, b);
201}
202
203static inline __m128d __attribute__((__always_inline__, __nodebug__))
204_mm_cmpnle_pd(__m128d a, __m128d b)
205{
206  return (__m128d)__builtin_ia32_cmpnlepd(a, b);
207}
208
209static inline __m128d __attribute__((__always_inline__, __nodebug__))
210_mm_cmpngt_pd(__m128d a, __m128d b)
211{
212  return (__m128d)__builtin_ia32_cmpnltpd(b, a);
213}
214
215static inline __m128d __attribute__((__always_inline__, __nodebug__))
216_mm_cmpnge_pd(__m128d a, __m128d b)
217{
218  return (__m128d)__builtin_ia32_cmpnlepd(b, a);
219}
220
221static inline __m128d __attribute__((__always_inline__, __nodebug__))
222_mm_cmpeq_sd(__m128d a, __m128d b)
223{
224  return (__m128d)__builtin_ia32_cmpeqsd(a, b);
225}
226
227static inline __m128d __attribute__((__always_inline__, __nodebug__))
228_mm_cmplt_sd(__m128d a, __m128d b)
229{
230  return (__m128d)__builtin_ia32_cmpltsd(a, b);
231}
232
233static inline __m128d __attribute__((__always_inline__, __nodebug__))
234_mm_cmple_sd(__m128d a, __m128d b)
235{
236  return (__m128d)__builtin_ia32_cmplesd(a, b);
237}
238
239static inline __m128d __attribute__((__always_inline__, __nodebug__))
240_mm_cmpgt_sd(__m128d a, __m128d b)
241{
242  return (__m128d)__builtin_ia32_cmpltsd(b, a);
243}
244
245static inline __m128d __attribute__((__always_inline__, __nodebug__))
246_mm_cmpge_sd(__m128d a, __m128d b)
247{
248  return (__m128d)__builtin_ia32_cmplesd(b, a);
249}
250
251static inline __m128d __attribute__((__always_inline__, __nodebug__))
252_mm_cmpord_sd(__m128d a, __m128d b)
253{
254  return (__m128d)__builtin_ia32_cmpordsd(a, b);
255}
256
257static inline __m128d __attribute__((__always_inline__, __nodebug__))
258_mm_cmpunord_sd(__m128d a, __m128d b)
259{
260  return (__m128d)__builtin_ia32_cmpunordsd(a, b);
261}
262
263static inline __m128d __attribute__((__always_inline__, __nodebug__))
264_mm_cmpneq_sd(__m128d a, __m128d b)
265{
266  return (__m128d)__builtin_ia32_cmpneqsd(a, b);
267}
268
269static inline __m128d __attribute__((__always_inline__, __nodebug__))
270_mm_cmpnlt_sd(__m128d a, __m128d b)
271{
272  return (__m128d)__builtin_ia32_cmpnltsd(a, b);
273}
274
275static inline __m128d __attribute__((__always_inline__, __nodebug__))
276_mm_cmpnle_sd(__m128d a, __m128d b)
277{
278  return (__m128d)__builtin_ia32_cmpnlesd(a, b);
279}
280
281static inline __m128d __attribute__((__always_inline__, __nodebug__))
282_mm_cmpngt_sd(__m128d a, __m128d b)
283{
284  return (__m128d)__builtin_ia32_cmpnltsd(b, a);
285}
286
287static inline __m128d __attribute__((__always_inline__, __nodebug__))
288_mm_cmpnge_sd(__m128d a, __m128d b)
289{
290  return (__m128d)__builtin_ia32_cmpnlesd(b, a);
291}
292
293static inline int __attribute__((__always_inline__, __nodebug__))
294_mm_comieq_sd(__m128d a, __m128d b)
295{
296  return __builtin_ia32_comisdeq(a, b);
297}
298
299static inline int __attribute__((__always_inline__, __nodebug__))
300_mm_comilt_sd(__m128d a, __m128d b)
301{
302  return __builtin_ia32_comisdlt(a, b);
303}
304
305static inline int __attribute__((__always_inline__, __nodebug__))
306_mm_comile_sd(__m128d a, __m128d b)
307{
308  return __builtin_ia32_comisdle(a, b);
309}
310
311static inline int __attribute__((__always_inline__, __nodebug__))
312_mm_comigt_sd(__m128d a, __m128d b)
313{
314  return __builtin_ia32_comisdgt(a, b);
315}
316
317static inline int __attribute__((__always_inline__, __nodebug__))
318_mm_comineq_sd(__m128d a, __m128d b)
319{
320  return __builtin_ia32_comisdneq(a, b);
321}
322
323static inline int __attribute__((__always_inline__, __nodebug__))
324_mm_ucomieq_sd(__m128d a, __m128d b)
325{
326  return __builtin_ia32_ucomisdeq(a, b);
327}
328
329static inline int __attribute__((__always_inline__, __nodebug__))
330_mm_ucomilt_sd(__m128d a, __m128d b)
331{
332  return __builtin_ia32_ucomisdlt(a, b);
333}
334
335static inline int __attribute__((__always_inline__, __nodebug__))
336_mm_ucomile_sd(__m128d a, __m128d b)
337{
338  return __builtin_ia32_ucomisdle(a, b);
339}
340
341static inline int __attribute__((__always_inline__, __nodebug__))
342_mm_ucomigt_sd(__m128d a, __m128d b)
343{
344  return __builtin_ia32_ucomisdgt(a, b);
345}
346
347static inline int __attribute__((__always_inline__, __nodebug__))
348_mm_ucomineq_sd(__m128d a, __m128d b)
349{
350  return __builtin_ia32_ucomisdneq(a, b);
351}
352
353static inline __m128 __attribute__((__always_inline__, __nodebug__))
354_mm_cvtpd_ps(__m128d a)
355{
356  return __builtin_ia32_cvtpd2ps(a);
357}
358
359static inline __m128d __attribute__((__always_inline__, __nodebug__))
360_mm_cvtps_pd(__m128 a)
361{
362  return __builtin_ia32_cvtps2pd(a);
363}
364
365static inline __m128d __attribute__((__always_inline__, __nodebug__))
366_mm_cvtepi32_pd(__m128i a)
367{
368  return __builtin_ia32_cvtdq2pd((__v4si)a);
369}
370
371static inline __m128i __attribute__((__always_inline__, __nodebug__))
372_mm_cvtpd_epi32(__m128d a)
373{
374  return __builtin_ia32_cvtpd2dq(a);
375}
376
377static inline int __attribute__((__always_inline__, __nodebug__))
378_mm_cvtsd_si32(__m128d a)
379{
380  return __builtin_ia32_cvtsd2si(a);
381}
382
383static inline __m128 __attribute__((__always_inline__, __nodebug__))
384_mm_cvtsd_ss(__m128 a, __m128d b)
385{
386  return __builtin_ia32_cvtsd2ss(a, b);
387}
388
389static inline __m128d __attribute__((__always_inline__, __nodebug__))
390_mm_cvtsi32_sd(__m128d a, int b)
391{
392  return __builtin_ia32_cvtsi2sd(a, b);
393}
394
395static inline __m128d __attribute__((__always_inline__, __nodebug__))
396_mm_cvtss_sd(__m128d a, __m128 b)
397{
398  return __builtin_ia32_cvtss2sd(a, b);
399}
400
401static inline __m128i __attribute__((__always_inline__, __nodebug__))
402_mm_cvttpd_epi32(__m128d a)
403{
404  return (__m128i)__builtin_ia32_cvttpd2dq(a);
405}
406
407static inline int __attribute__((__always_inline__, __nodebug__))
408_mm_cvttsd_si32(__m128d a)
409{
410  return __builtin_ia32_cvttsd2si(a);
411}
412
413static inline __m64 __attribute__((__always_inline__, __nodebug__))
414_mm_cvtpd_pi32(__m128d a)
415{
416  return (__m64)__builtin_ia32_cvtpd2pi(a);
417}
418
419static inline __m64 __attribute__((__always_inline__, __nodebug__))
420_mm_cvttpd_pi32(__m128d a)
421{
422  return (__m64)__builtin_ia32_cvttpd2pi(a);
423}
424
425static inline __m128d __attribute__((__always_inline__, __nodebug__))
426_mm_cvtpi32_pd(__m64 a)
427{
428  return __builtin_ia32_cvtpi2pd((__v2si)a);
429}
430
431static inline double __attribute__((__always_inline__, __nodebug__))
432_mm_cvtsd_f64(__m128d a)
433{
434  return a[0];
435}
436
437static inline __m128d __attribute__((__always_inline__, __nodebug__))
438_mm_load_pd(double const *dp)
439{
440  return *(__m128d*)dp;
441}
442
443static inline __m128d __attribute__((__always_inline__, __nodebug__))
444_mm_load1_pd(double const *dp)
445{
446  return (__m128d){ dp[0], dp[0] };
447}
448
449static inline __m128d __attribute__((__always_inline__, __nodebug__))
450_mm_loadr_pd(double const *dp)
451{
452  return (__m128d){ dp[1], dp[0] };
453}
454
455static inline __m128d __attribute__((__always_inline__, __nodebug__))
456_mm_loadu_pd(double const *dp)
457{
458  return __builtin_ia32_loadupd(dp);
459}
460
461static inline __m128d __attribute__((__always_inline__, __nodebug__))
462_mm_load_sd(double const *dp)
463{
464  return (__m128d){ *dp, 0.0 };
465}
466
467static inline __m128d __attribute__((__always_inline__, __nodebug__))
468_mm_loadh_pd(__m128d a, double const *dp)
469{
470  return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
471}
472
473static inline __m128d __attribute__((__always_inline__, __nodebug__))
474_mm_loadl_pd(__m128d a, double const *dp)
475{
476  return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
477}
478
479static inline __m128d __attribute__((__always_inline__, __nodebug__))
480_mm_set_sd(double w)
481{
482  return (__m128d){ w, 0 };
483}
484
485static inline __m128d __attribute__((__always_inline__, __nodebug__))
486_mm_set1_pd(double w)
487{
488  return (__m128d){ w, w };
489}
490
491static inline __m128d __attribute__((__always_inline__, __nodebug__))
492_mm_set_pd(double w, double x)
493{
494  return (__m128d){ w, x };
495}
496
497static inline __m128d __attribute__((__always_inline__, __nodebug__))
498_mm_setr_pd(double w, double x)
499{
500  return (__m128d){ x, w };
501}
502
503static inline __m128d __attribute__((__always_inline__, __nodebug__))
504_mm_setzero_pd(void)
505{
506  return (__m128d){ 0, 0 };
507}
508
509static inline __m128d __attribute__((__always_inline__, __nodebug__))
510_mm_move_sd(__m128d a, __m128d b)
511{
512  return (__m128d){ b[0], a[1] };
513}
514
515static inline void __attribute__((__always_inline__, __nodebug__))
516_mm_store_sd(double *dp, __m128d a)
517{
518  dp[0] = a[0];
519}
520
521static inline void __attribute__((__always_inline__, __nodebug__))
522_mm_store1_pd(double *dp, __m128d a)
523{
524  dp[0] = a[0];
525  dp[1] = a[0];
526}
527
528static inline void __attribute__((__always_inline__, __nodebug__))
529_mm_store_pd(double *dp, __m128d a)
530{
531  *(__m128d *)dp = a;
532}
533
534static inline void __attribute__((__always_inline__, __nodebug__))
535_mm_storeu_pd(double *dp, __m128d a)
536{
537  __builtin_ia32_storeupd(dp, a);
538}
539
540static inline void __attribute__((__always_inline__, __nodebug__))
541_mm_storer_pd(double *dp, __m128d a)
542{
543  dp[0] = a[1];
544  dp[1] = a[0];
545}
546
547static inline void __attribute__((__always_inline__, __nodebug__))
548_mm_storeh_pd(double *dp, __m128d a)
549{
550  dp[0] = a[1];
551}
552
553static inline void __attribute__((__always_inline__, __nodebug__))
554_mm_storel_pd(double *dp, __m128d a)
555{
556  dp[0] = a[0];
557}
558
559static inline __m128i __attribute__((__always_inline__, __nodebug__))
560_mm_add_epi8(__m128i a, __m128i b)
561{
562  return (__m128i)((__v16qi)a + (__v16qi)b);
563}
564
565static inline __m128i __attribute__((__always_inline__, __nodebug__))
566_mm_add_epi16(__m128i a, __m128i b)
567{
568  return (__m128i)((__v8hi)a + (__v8hi)b);
569}
570
571static inline __m128i __attribute__((__always_inline__, __nodebug__))
572_mm_add_epi32(__m128i a, __m128i b)
573{
574  return (__m128i)((__v4si)a + (__v4si)b);
575}
576
577static inline __m64 __attribute__((__always_inline__, __nodebug__))
578_mm_add_si64(__m64 a, __m64 b)
579{
580  return a + b;
581}
582
583static inline __m128i __attribute__((__always_inline__, __nodebug__))
584_mm_add_epi64(__m128i a, __m128i b)
585{
586  return a + b;
587}
588
589static inline __m128i __attribute__((__always_inline__, __nodebug__))
590_mm_adds_epi8(__m128i a, __m128i b)
591{
592  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
593}
594
595static inline __m128i __attribute__((__always_inline__, __nodebug__))
596_mm_adds_epi16(__m128i a, __m128i b)
597{
598  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
599}
600
601static inline __m128i __attribute__((__always_inline__, __nodebug__))
602_mm_adds_epu8(__m128i a, __m128i b)
603{
604  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
605}
606
607static inline __m128i __attribute__((__always_inline__, __nodebug__))
608_mm_adds_epu16(__m128i a, __m128i b)
609{
610  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
611}
612
613static inline __m128i __attribute__((__always_inline__, __nodebug__))
614_mm_avg_epu8(__m128i a, __m128i b)
615{
616  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
617}
618
619static inline __m128i __attribute__((__always_inline__, __nodebug__))
620_mm_avg_epu16(__m128i a, __m128i b)
621{
622  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
623}
624
625static inline __m128i __attribute__((__always_inline__, __nodebug__))
626_mm_madd_epi16(__m128i a, __m128i b)
627{
628  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
629}
630
631static inline __m128i __attribute__((__always_inline__, __nodebug__))
632_mm_max_epi16(__m128i a, __m128i b)
633{
634  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
635}
636
637static inline __m128i __attribute__((__always_inline__, __nodebug__))
638_mm_max_epu8(__m128i a, __m128i b)
639{
640  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
641}
642
643static inline __m128i __attribute__((__always_inline__, __nodebug__))
644_mm_min_epi16(__m128i a, __m128i b)
645{
646  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
647}
648
649static inline __m128i __attribute__((__always_inline__, __nodebug__))
650_mm_min_epu8(__m128i a, __m128i b)
651{
652  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
653}
654
655static inline __m128i __attribute__((__always_inline__, __nodebug__))
656_mm_mulhi_epi16(__m128i a, __m128i b)
657{
658  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
659}
660
661static inline __m128i __attribute__((__always_inline__, __nodebug__))
662_mm_mulhi_epu16(__m128i a, __m128i b)
663{
664  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
665}
666
667static inline __m128i __attribute__((__always_inline__, __nodebug__))
668_mm_mullo_epi16(__m128i a, __m128i b)
669{
670  return (__m128i)__builtin_ia32_pmullw128((__v8hi)a, (__v8hi)b);
671}
672
673static inline __m64 __attribute__((__always_inline__, __nodebug__))
674_mm_mul_su32(__m64 a, __m64 b)
675{
676  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
677}
678
679static inline __m128i __attribute__((__always_inline__, __nodebug__))
680_mm_mul_epu32(__m128i a, __m128i b)
681{
682  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
683}
684
685static inline __m128i __attribute__((__always_inline__, __nodebug__))
686_mm_sad_epu8(__m128i a, __m128i b)
687{
688  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
689}
690
691static inline __m128i __attribute__((__always_inline__, __nodebug__))
692_mm_sub_epi8(__m128i a, __m128i b)
693{
694  return (__m128i)((__v16qi)a - (__v16qi)b);
695}
696
697static inline __m128i __attribute__((__always_inline__, __nodebug__))
698_mm_sub_epi16(__m128i a, __m128i b)
699{
700  return (__m128i)((__v8hi)a - (__v8hi)b);
701}
702
703static inline __m128i __attribute__((__always_inline__, __nodebug__))
704_mm_sub_epi32(__m128i a, __m128i b)
705{
706  return (__m128i)((__v4si)a - (__v4si)b);
707}
708
709static inline __m64 __attribute__((__always_inline__, __nodebug__))
710_mm_sub_si64(__m64 a, __m64 b)
711{
712  return a - b;
713}
714
715static inline __m128i __attribute__((__always_inline__, __nodebug__))
716_mm_sub_epi64(__m128i a, __m128i b)
717{
718  return a - b;
719}
720
721static inline __m128i __attribute__((__always_inline__, __nodebug__))
722_mm_subs_epi8(__m128i a, __m128i b)
723{
724  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
725}
726
727static inline __m128i __attribute__((__always_inline__, __nodebug__))
728_mm_subs_epi16(__m128i a, __m128i b)
729{
730  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
731}
732
733static inline __m128i __attribute__((__always_inline__, __nodebug__))
734_mm_subs_epu8(__m128i a, __m128i b)
735{
736  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
737}
738
739static inline __m128i __attribute__((__always_inline__, __nodebug__))
740_mm_subs_epu16(__m128i a, __m128i b)
741{
742  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
743}
744
745static inline __m128i __attribute__((__always_inline__, __nodebug__))
746_mm_and_si128(__m128i a, __m128i b)
747{
748  return __builtin_ia32_pand128(a, b);
749}
750
751static inline __m128i __attribute__((__always_inline__, __nodebug__))
752_mm_andnot_si128(__m128i a, __m128i b)
753{
754  return __builtin_ia32_pandn128(a, b);
755}
756
757static inline __m128i __attribute__((__always_inline__, __nodebug__))
758_mm_or_si128(__m128i a, __m128i b)
759{
760  return __builtin_ia32_por128(a, b);
761}
762
763static inline __m128i __attribute__((__always_inline__, __nodebug__))
764_mm_xor_si128(__m128i a, __m128i b)
765{
766  return __builtin_ia32_pxor128(a, b);
767}
768
769static inline __m128i __attribute__((__always_inline__, __nodebug__))
770_mm_slli_si128(__m128i a, int imm)
771{
772  return __builtin_ia32_pslldqi128(a, imm * 8);
773}
774
775static inline __m128i __attribute__((__always_inline__, __nodebug__))
776_mm_slli_epi16(__m128i a, int count)
777{
778  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
779}
780
781static inline __m128i __attribute__((__always_inline__, __nodebug__))
782_mm_sll_epi16(__m128i a, __m128i count)
783{
784  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
785}
786
787static inline __m128i __attribute__((__always_inline__, __nodebug__))
788_mm_slli_epi32(__m128i a, int count)
789{
790  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
791}
792
793static inline __m128i __attribute__((__always_inline__, __nodebug__))
794_mm_sll_epi32(__m128i a, __m128i count)
795{
796  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
797}
798
799static inline __m128i __attribute__((__always_inline__, __nodebug__))
800_mm_slli_epi64(__m128i a, int count)
801{
802  return __builtin_ia32_psllqi128(a, count);
803}
804
805static inline __m128i __attribute__((__always_inline__, __nodebug__))
806_mm_sll_epi64(__m128i a, __m128i count)
807{
808  return __builtin_ia32_psllq128(a, count);
809}
810
811static inline __m128i __attribute__((__always_inline__, __nodebug__))
812_mm_srai_epi16(__m128i a, int count)
813{
814  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
815}
816
817static inline __m128i __attribute__((__always_inline__, __nodebug__))
818_mm_sra_epi16(__m128i a, __m128i count)
819{
820  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
821}
822
823static inline __m128i __attribute__((__always_inline__, __nodebug__))
824_mm_srai_epi32(__m128i a, int count)
825{
826  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
827}
828
829static inline __m128i __attribute__((__always_inline__, __nodebug__))
830_mm_sra_epi32(__m128i a, __m128i count)
831{
832  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
833}
834
835static inline __m128i __attribute__((__always_inline__, __nodebug__))
836_mm_srli_si128(__m128i a, int imm)
837{
838  return __builtin_ia32_psrldqi128(a, imm * 8);
839}
840
841static inline __m128i __attribute__((__always_inline__, __nodebug__))
842_mm_srli_epi16(__m128i a, int count)
843{
844  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
845}
846
847static inline __m128i __attribute__((__always_inline__, __nodebug__))
848_mm_srl_epi16(__m128i a, __m128i count)
849{
850  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
851}
852
853static inline __m128i __attribute__((__always_inline__, __nodebug__))
854_mm_srli_epi32(__m128i a, int count)
855{
856  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
857}
858
859static inline __m128i __attribute__((__always_inline__, __nodebug__))
860_mm_srl_epi32(__m128i a, __m128i count)
861{
862  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
863}
864
865static inline __m128i __attribute__((__always_inline__, __nodebug__))
866_mm_srli_epi64(__m128i a, int count)
867{
868  return __builtin_ia32_psrlqi128(a, count);
869}
870
871static inline __m128i __attribute__((__always_inline__, __nodebug__))
872_mm_srl_epi64(__m128i a, __m128i count)
873{
874  return __builtin_ia32_psrlq128(a, count);
875}
876
877static inline __m128i __attribute__((__always_inline__, __nodebug__))
878_mm_cmpeq_epi8(__m128i a, __m128i b)
879{
880  return (__m128i)__builtin_ia32_pcmpeqb128((__v16qi)a, (__v16qi)b);
881}
882
883static inline __m128i __attribute__((__always_inline__, __nodebug__))
884_mm_cmpeq_epi16(__m128i a, __m128i b)
885{
886  return (__m128i)__builtin_ia32_pcmpeqw128((__v8hi)a, (__v8hi)b);
887}
888
889static inline __m128i __attribute__((__always_inline__, __nodebug__))
890_mm_cmpeq_epi32(__m128i a, __m128i b)
891{
892  return (__m128i)__builtin_ia32_pcmpeqd128((__v4si)a, (__v4si)b);
893}
894
895static inline __m128i __attribute__((__always_inline__, __nodebug__))
896_mm_cmpgt_epi8(__m128i a, __m128i b)
897{
898  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)a, (__v16qi)b);
899}
900
901static inline __m128i __attribute__((__always_inline__, __nodebug__))
902_mm_cmpgt_epi16(__m128i a, __m128i b)
903{
904  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)a, (__v8hi)b);
905}
906
907static inline __m128i __attribute__((__always_inline__, __nodebug__))
908_mm_cmpgt_epi32(__m128i a, __m128i b)
909{
910  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)a, (__v4si)b);
911}
912
913static inline __m128i __attribute__((__always_inline__, __nodebug__))
914_mm_cmplt_epi8(__m128i a, __m128i b)
915{
916  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)b, (__v16qi)a);
917}
918
919static inline __m128i __attribute__((__always_inline__, __nodebug__))
920_mm_cmplt_epi16(__m128i a, __m128i b)
921{
922  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)b, (__v8hi)a);
923}
924
925static inline __m128i __attribute__((__always_inline__, __nodebug__))
926_mm_cmplt_epi32(__m128i a, __m128i b)
927{
928  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)b, (__v4si)a);
929}
930
931#ifdef __x86_64__
932static inline __m128d __attribute__((__always_inline__, __nodebug__))
933_mm_cvtsi64_sd(__m128d a, long long b)
934{
935  return __builtin_ia32_cvtsi642sd(a, b);
936}
937
938static inline long long __attribute__((__always_inline__, __nodebug__))
939_mm_cvtsd_si64(__m128d a)
940{
941  return __builtin_ia32_cvtsd2si64(a);
942}
943
944static inline long long __attribute__((__always_inline__, __nodebug__))
945_mm_cvttsd_si64(__m128d a)
946{
947  return __builtin_ia32_cvttsd2si64(a);
948}
949#endif
950
951static inline __m128 __attribute__((__always_inline__, __nodebug__))
952_mm_cvtepi32_ps(__m128i a)
953{
954  return __builtin_ia32_cvtdq2ps((__v4si)a);
955}
956
957static inline __m128i __attribute__((__always_inline__, __nodebug__))
958_mm_cvtps_epi32(__m128 a)
959{
960  return (__m128i)__builtin_ia32_cvtps2dq(a);
961}
962
963static inline __m128i __attribute__((__always_inline__, __nodebug__))
964_mm_cvttps_epi32(__m128 a)
965{
966  return (__m128i)__builtin_ia32_cvttps2dq(a);
967}
968
969static inline __m128i __attribute__((__always_inline__, __nodebug__))
970_mm_cvtsi32_si128(int a)
971{
972  return (__m128i)(__v4si){ a, 0, 0, 0 };
973}
974
975#ifdef __x86_64__
976static inline __m128i __attribute__((__always_inline__, __nodebug__))
977_mm_cvtsi64_si128(long long a)
978{
979  return (__m128i){ a, 0 };
980}
981#endif
982
983static inline int __attribute__((__always_inline__, __nodebug__))
984_mm_cvtsi128_si32(__m128i a)
985{
986  __v4si b = (__v4si)a;
987  return b[0];
988}
989
990#ifdef __x86_64__
991static inline long long __attribute__((__always_inline__, __nodebug__))
992_mm_cvtsi128_si64(__m128i a)
993{
994  return a[0];
995}
996#endif
997
998static inline __m128i __attribute__((__always_inline__, __nodebug__))
999_mm_load_si128(__m128i const *p)
1000{
1001  return *p;
1002}
1003
1004static inline __m128i __attribute__((__always_inline__, __nodebug__))
1005_mm_loadu_si128(__m128i const *p)
1006{
1007  return (__m128i)__builtin_ia32_loaddqu((char const *)p);
1008}
1009
1010static inline __m128i __attribute__((__always_inline__, __nodebug__))
1011_mm_loadl_epi64(__m128i const *p)
1012{
1013  return (__m128i)__builtin_ia32_loadlv4si((__v2si *)p);
1014}
1015
1016static inline __m128i __attribute__((__always_inline__, __nodebug__))
1017_mm_set_epi64(__m64 q1, __m64 q0)
1018{
1019  return (__m128i){ (long long)q0, (long long)q1 };
1020}
1021
1022static inline __m128i __attribute__((__always_inline__, __nodebug__))
1023_mm_set_epi32(int i3, int i2, int i1, int i0)
1024{
1025  return (__m128i)(__v4si){ i0, i1, i2, i3};
1026}
1027
1028static inline __m128i __attribute__((__always_inline__, __nodebug__))
1029_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1030{
1031  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1032}
1033
1034static inline __m128i __attribute__((__always_inline__, __nodebug__))
1035_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1036{
1037  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1038}
1039
1040static inline __m128i __attribute__((__always_inline__, __nodebug__))
1041_mm_set1_epi64(__m64 q)
1042{
1043  return (__m128i){ (long long)q, (long long)q };
1044}
1045
1046static inline __m128i __attribute__((__always_inline__, __nodebug__))
1047_mm_set1_epi32(int i)
1048{
1049  return (__m128i)(__v4si){ i, i, i, i };
1050}
1051
1052static inline __m128i __attribute__((__always_inline__, __nodebug__))
1053_mm_set1_epi16(short w)
1054{
1055  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1056}
1057
1058static inline __m128i __attribute__((__always_inline__, __nodebug__))
1059_mm_set1_epi8(char b)
1060{
1061  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1062}
1063
1064static inline __m128i __attribute__((__always_inline__, __nodebug__))
1065_mm_setr_epi64(__m64 q0, __m64 q1)
1066{
1067  return (__m128i){ (long long)q0, (long long)q1 };
1068}
1069
1070static inline __m128i __attribute__((__always_inline__, __nodebug__))
1071_mm_setr_epi32(int i0, int i1, int i2, int i3)
1072{
1073  return (__m128i)(__v4si){ i0, i1, i2, i3};
1074}
1075
1076static inline __m128i __attribute__((__always_inline__, __nodebug__))
1077_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1078{
1079  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1080}
1081
1082static inline __m128i __attribute__((__always_inline__, __nodebug__))
1083_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1084{
1085  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1086}
1087
1088static inline __m128i __attribute__((__always_inline__, __nodebug__))
1089_mm_setzero_si128(void)
1090{
1091  return (__m128i){ 0LL, 0LL };
1092}
1093
1094static inline void __attribute__((__always_inline__, __nodebug__))
1095_mm_store_si128(__m128i *p, __m128i b)
1096{
1097  *p = b;
1098}
1099
1100static inline void __attribute__((__always_inline__, __nodebug__))
1101_mm_storeu_si128(__m128i *p, __m128i b)
1102{
1103  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1104}
1105
1106static inline void __attribute__((__always_inline__, __nodebug__))
1107_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1108{
1109  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1110}
1111
1112static inline void __attribute__((__always_inline__, __nodebug__))
1113_mm_storel_epi64(__m128i *p, __m128i a)
1114{
1115  __builtin_ia32_storelv4si((__v2si *)p, a);
1116}
1117
1118static inline void __attribute__((__always_inline__, __nodebug__))
1119_mm_stream_pd(double *p, __m128d a)
1120{
1121  __builtin_ia32_movntpd(p, a);
1122}
1123
1124static inline void __attribute__((__always_inline__, __nodebug__))
1125_mm_stream_si128(__m128i *p, __m128i a)
1126{
1127  __builtin_ia32_movntdq(p, a);
1128}
1129
/*
 * Writes the 32-bit integer a to *p through __builtin_ia32_movnti.
 *
 * NOTE(review): the builtin name suggests a non-temporal (cache-bypassing)
 * MOVNTI store; confirm against the ISA reference.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *dst = p;
  int value = a;

  __builtin_ia32_movnti(dst, value);
}
1135
/*
 * Hands the address p to __builtin_ia32_clflush.
 *
 * NOTE(review): presumably this flushes the cache line containing p
 * (CLFLUSH); confirm against the ISA reference.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *addr = p;

  __builtin_ia32_clflush(addr);
}
1141
/*
 * Invokes __builtin_ia32_lfence; takes no arguments and returns nothing.
 *
 * NOTE(review): by name this is the LFENCE load-fence instruction; see the
 * ISA reference for its exact ordering guarantees.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
  return;
}
1147
/*
 * Invokes __builtin_ia32_mfence; takes no arguments and returns nothing.
 *
 * NOTE(review): by name this is the MFENCE memory-fence instruction; see the
 * ISA reference for its exact ordering guarantees.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
  return;
}
1153
1154static inline __m128i __attribute__((__always_inline__, __nodebug__))
1155_mm_packs_epi16(__m128i a, __m128i b)
1156{
1157  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1158}
1159
1160static inline __m128i __attribute__((__always_inline__, __nodebug__))
1161_mm_packs_epi32(__m128i a, __m128i b)
1162{
1163  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1164}
1165
1166static inline __m128i __attribute__((__always_inline__, __nodebug__))
1167_mm_packus_epi16(__m128i a, __m128i b)
1168{
1169  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1170}
1171
1172static inline int __attribute__((__always_inline__, __nodebug__))
1173_mm_extract_epi16(__m128i a, int imm)
1174{
1175  __v8hi b = (__v8hi)a;
1176  return b[imm];
1177}
1178
1179static inline __m128i __attribute__((__always_inline__, __nodebug__))
1180_mm_insert_epi16(__m128i a, int b, int imm)
1181{
1182  return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)a, b, imm);
1183}
1184
1185static inline int __attribute__((__always_inline__, __nodebug__))
1186_mm_movemask_epi8(__m128i a)
1187{
1188  return __builtin_ia32_pmovmskb128((__v16qi)a);
1189}
1190
/*
 * Expands to __builtin_ia32_pshufd applied to a (viewed as four 32-bit
 * lanes) and the selector imm, with the result cast to __m128i. Implemented
 * as a macro so that imm reaches the builtin unchanged.
 */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(a), (imm)))
/*
 * Expands to __builtin_ia32_pshufhw applied to a (viewed as eight 16-bit
 * lanes) and the selector imm, with the result cast to __m128i. Implemented
 * as a macro so that imm reaches the builtin unchanged.
 */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(a), (imm)))
/*
 * Expands to __builtin_ia32_pshuflw applied to a (viewed as eight 16-bit
 * lanes) and the selector imm, with the result cast to __m128i. Implemented
 * as a macro so that imm reaches the builtin unchanged.
 */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(a), (imm)))
1194
1195static inline __m128i __attribute__((__always_inline__, __nodebug__))
1196_mm_unpackhi_epi8(__m128i a, __m128i b)
1197{
1198  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1199}
1200
1201static inline __m128i __attribute__((__always_inline__, __nodebug__))
1202_mm_unpackhi_epi16(__m128i a, __m128i b)
1203{
1204  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1205}
1206
1207static inline __m128i __attribute__((__always_inline__, __nodebug__))
1208_mm_unpackhi_epi32(__m128i a, __m128i b)
1209{
1210  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1211}
1212
1213static inline __m128i __attribute__((__always_inline__, __nodebug__))
1214_mm_unpackhi_epi64(__m128i a, __m128i b)
1215{
1216  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1217}
1218
1219static inline __m128i __attribute__((__always_inline__, __nodebug__))
1220_mm_unpacklo_epi8(__m128i a, __m128i b)
1221{
1222  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1223}
1224
1225static inline __m128i __attribute__((__always_inline__, __nodebug__))
1226_mm_unpacklo_epi16(__m128i a, __m128i b)
1227{
1228  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1229}
1230
1231static inline __m128i __attribute__((__always_inline__, __nodebug__))
1232_mm_unpacklo_epi32(__m128i a, __m128i b)
1233{
1234  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1235}
1236
1237static inline __m128i __attribute__((__always_inline__, __nodebug__))
1238_mm_unpacklo_epi64(__m128i a, __m128i b)
1239{
1240  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1241}
1242
1243static inline __m64 __attribute__((__always_inline__, __nodebug__))
1244_mm_movepi64_pi64(__m128i a)
1245{
1246  return (__m64)a[0];
1247}
1248
1249static inline __m128i __attribute__((__always_inline__, __nodebug__))
1250_mm_movpi64_pi64(__m64 a)
1251{
1252  return (__m128i){ (long long)a, 0 };
1253}
1254
1255static inline __m128i __attribute__((__always_inline__, __nodebug__))
1256_mm_move_epi64(__m128i a)
1257{
1258  return (__m128i){ a[0], 0 };
1259}
1260
1261static inline __m128d __attribute__((__always_inline__, __nodebug__))
1262_mm_unpackhi_pd(__m128d a, __m128d b)
1263{
1264  return __builtin_shufflevector(a, b, 1, 2+1);
1265}
1266
1267static inline __m128d __attribute__((__always_inline__, __nodebug__))
1268_mm_unpacklo_pd(__m128d a, __m128d b)
1269{
1270  return __builtin_shufflevector(a, b, 0, 2+0);
1271}
1272
1273static inline int __attribute__((__always_inline__, __nodebug__))
1274_mm_movemask_pd(__m128d a)
1275{
1276  return __builtin_ia32_movmskpd(a);
1277}
1278
/*
 * Expands to __builtin_ia32_shufpd applied to a, b and the selector i.
 * Implemented as a macro so that i reaches the builtin unchanged; a selector
 * can be built with _MM_SHUFFLE2.
 */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_ia32_shufpd((a), (b), (i)))
1280
1281static inline __m128 __attribute__((__always_inline__, __nodebug__))
1282_mm_castpd_ps(__m128d in)
1283{
1284  return (__m128)in;
1285}
1286
1287static inline __m128i __attribute__((__always_inline__, __nodebug__))
1288_mm_castpd_si128(__m128d in)
1289{
1290  return (__m128i)in;
1291}
1292
1293static inline __m128d __attribute__((__always_inline__, __nodebug__))
1294_mm_castps_pd(__m128 in)
1295{
1296  return (__m128d)in;
1297}
1298
1299static inline __m128i __attribute__((__always_inline__, __nodebug__))
1300_mm_castps_si128(__m128 in)
1301{
1302  return (__m128i)in;
1303}
1304
1305static inline __m128 __attribute__((__always_inline__, __nodebug__))
1306_mm_castsi128_ps(__m128i in)
1307{
1308  return (__m128)in;
1309}
1310
1311static inline __m128d __attribute__((__always_inline__, __nodebug__))
1312_mm_castsi128_pd(__m128i in)
1313{
1314  return (__m128d)in;
1315}
1316
/*
 * Emits a single "pause" instruction through volatile inline assembly, so
 * the compiler will not remove it.
 *
 * NOTE(review): PAUSE is conventionally used as a spin-wait hint; see the
 * ISA reference for details.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1322
/*
 * Packs two one-bit selectors into a two-bit value: x becomes bit 1 and y
 * becomes bit 0, e.g. _MM_SHUFFLE2(1, 0) == 2.
 */
#define _MM_SHUFFLE2(x, y) \
  (((x) << 1) | (y))
1324
1325#endif /* __SSE2__ */
1326
1327#endif /* __EMMINTRIN_H */
1328