emmintrin.h revision 80c800465865aa15ec4b094407170c149ce344cd
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public 16-byte vector types: one with double elements, one with long long
 * elements. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Helper 16-byte vector types with int, short and char elements; used below
 * to reinterpret an __m128i with a specific element width. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
39
40static inline __m128d __attribute__((__always_inline__, __nodebug__))
41_mm_add_sd(__m128d a, __m128d b)
42{
43  a[0] += b[0];
44  return a;
45}
46
47static inline __m128d __attribute__((__always_inline__, __nodebug__))
48_mm_add_pd(__m128d a, __m128d b)
49{
50  return a + b;
51}
52
53static inline __m128d __attribute__((__always_inline__, __nodebug__))
54_mm_sub_sd(__m128d a, __m128d b)
55{
56  a[0] -= b[0];
57  return a;
58}
59
60static inline __m128d __attribute__((__always_inline__, __nodebug__))
61_mm_sub_pd(__m128d a, __m128d b)
62{
63  return a - b;
64}
65
66static inline __m128d __attribute__((__always_inline__, __nodebug__))
67_mm_mul_sd(__m128d a, __m128d b)
68{
69  a[0] *= b[0];
70  return a;
71}
72
73static inline __m128d __attribute__((__always_inline__, __nodebug__))
74_mm_mul_pd(__m128d a, __m128d b)
75{
76  return a * b;
77}
78
79static inline __m128d __attribute__((__always_inline__, __nodebug__))
80_mm_div_sd(__m128d a, __m128d b)
81{
82  a[0] /= b[0];
83  return a;
84}
85
86static inline __m128d __attribute__((__always_inline__, __nodebug__))
87_mm_div_pd(__m128d a, __m128d b)
88{
89  return a / b;
90}
91
92static inline __m128d __attribute__((__always_inline__, __nodebug__))
93_mm_sqrt_sd(__m128d a, __m128d b)
94{
95  __m128d c = __builtin_ia32_sqrtsd(b);
96  return (__m128d) { c[0], a[1] };
97}
98
99static inline __m128d __attribute__((__always_inline__, __nodebug__))
100_mm_sqrt_pd(__m128d a)
101{
102  return __builtin_ia32_sqrtpd(a);
103}
104
105static inline __m128d __attribute__((__always_inline__, __nodebug__))
106_mm_min_sd(__m128d a, __m128d b)
107{
108  return __builtin_ia32_minsd(a, b);
109}
110
111static inline __m128d __attribute__((__always_inline__, __nodebug__))
112_mm_min_pd(__m128d a, __m128d b)
113{
114  return __builtin_ia32_minpd(a, b);
115}
116
117static inline __m128d __attribute__((__always_inline__, __nodebug__))
118_mm_max_sd(__m128d a, __m128d b)
119{
120  return __builtin_ia32_maxsd(a, b);
121}
122
123static inline __m128d __attribute__((__always_inline__, __nodebug__))
124_mm_max_pd(__m128d a, __m128d b)
125{
126  return __builtin_ia32_maxpd(a, b);
127}
128
129static inline __m128d __attribute__((__always_inline__, __nodebug__))
130_mm_and_pd(__m128d a, __m128d b)
131{
132  return (__m128)((__v4si)a & (__v4si)b);
133}
134
135static inline __m128d __attribute__((__always_inline__, __nodebug__))
136_mm_andnot_pd(__m128d a, __m128d b)
137{
138  return (__m128)(~(__v4si)a & (__v4si)b);
139}
140
141static inline __m128d __attribute__((__always_inline__, __nodebug__))
142_mm_or_pd(__m128d a, __m128d b)
143{
144  return (__m128)((__v4si)a | (__v4si)b);
145}
146
147static inline __m128d __attribute__((__always_inline__, __nodebug__))
148_mm_xor_pd(__m128d a, __m128d b)
149{
150  return (__m128)((__v4si)a ^ (__v4si)b);
151}
152
153static inline __m128d __attribute__((__always_inline__, __nodebug__))
154_mm_cmpeq_pd(__m128d a, __m128d b)
155{
156  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
157}
158
159static inline __m128d __attribute__((__always_inline__, __nodebug__))
160_mm_cmplt_pd(__m128d a, __m128d b)
161{
162  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
163}
164
165static inline __m128d __attribute__((__always_inline__, __nodebug__))
166_mm_cmple_pd(__m128d a, __m128d b)
167{
168  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
169}
170
171static inline __m128d __attribute__((__always_inline__, __nodebug__))
172_mm_cmpgt_pd(__m128d a, __m128d b)
173{
174  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
175}
176
177static inline __m128d __attribute__((__always_inline__, __nodebug__))
178_mm_cmpge_pd(__m128d a, __m128d b)
179{
180  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
181}
182
183static inline __m128d __attribute__((__always_inline__, __nodebug__))
184_mm_cmpord_pd(__m128d a, __m128d b)
185{
186  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
187}
188
189static inline __m128d __attribute__((__always_inline__, __nodebug__))
190_mm_cmpunord_pd(__m128d a, __m128d b)
191{
192  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
193}
194
195static inline __m128d __attribute__((__always_inline__, __nodebug__))
196_mm_cmpneq_pd(__m128d a, __m128d b)
197{
198  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
199}
200
201static inline __m128d __attribute__((__always_inline__, __nodebug__))
202_mm_cmpnlt_pd(__m128d a, __m128d b)
203{
204  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
205}
206
207static inline __m128d __attribute__((__always_inline__, __nodebug__))
208_mm_cmpnle_pd(__m128d a, __m128d b)
209{
210  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
211}
212
213static inline __m128d __attribute__((__always_inline__, __nodebug__))
214_mm_cmpngt_pd(__m128d a, __m128d b)
215{
216  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
217}
218
219static inline __m128d __attribute__((__always_inline__, __nodebug__))
220_mm_cmpnge_pd(__m128d a, __m128d b)
221{
222  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
223}
224
225static inline __m128d __attribute__((__always_inline__, __nodebug__))
226_mm_cmpeq_sd(__m128d a, __m128d b)
227{
228  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
229}
230
231static inline __m128d __attribute__((__always_inline__, __nodebug__))
232_mm_cmplt_sd(__m128d a, __m128d b)
233{
234  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
235}
236
237static inline __m128d __attribute__((__always_inline__, __nodebug__))
238_mm_cmple_sd(__m128d a, __m128d b)
239{
240  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
241}
242
243static inline __m128d __attribute__((__always_inline__, __nodebug__))
244_mm_cmpgt_sd(__m128d a, __m128d b)
245{
246  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
247}
248
249static inline __m128d __attribute__((__always_inline__, __nodebug__))
250_mm_cmpge_sd(__m128d a, __m128d b)
251{
252  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
253}
254
255static inline __m128d __attribute__((__always_inline__, __nodebug__))
256_mm_cmpord_sd(__m128d a, __m128d b)
257{
258  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
259}
260
261static inline __m128d __attribute__((__always_inline__, __nodebug__))
262_mm_cmpunord_sd(__m128d a, __m128d b)
263{
264  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
265}
266
267static inline __m128d __attribute__((__always_inline__, __nodebug__))
268_mm_cmpneq_sd(__m128d a, __m128d b)
269{
270  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
271}
272
273static inline __m128d __attribute__((__always_inline__, __nodebug__))
274_mm_cmpnlt_sd(__m128d a, __m128d b)
275{
276  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
277}
278
279static inline __m128d __attribute__((__always_inline__, __nodebug__))
280_mm_cmpnle_sd(__m128d a, __m128d b)
281{
282  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
283}
284
285static inline __m128d __attribute__((__always_inline__, __nodebug__))
286_mm_cmpngt_sd(__m128d a, __m128d b)
287{
288  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
289}
290
291static inline __m128d __attribute__((__always_inline__, __nodebug__))
292_mm_cmpnge_sd(__m128d a, __m128d b)
293{
294  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
295}
296
297static inline int __attribute__((__always_inline__, __nodebug__))
298_mm_comieq_sd(__m128d a, __m128d b)
299{
300  return __builtin_ia32_comisdeq(a, b);
301}
302
303static inline int __attribute__((__always_inline__, __nodebug__))
304_mm_comilt_sd(__m128d a, __m128d b)
305{
306  return __builtin_ia32_comisdlt(a, b);
307}
308
309static inline int __attribute__((__always_inline__, __nodebug__))
310_mm_comile_sd(__m128d a, __m128d b)
311{
312  return __builtin_ia32_comisdle(a, b);
313}
314
315static inline int __attribute__((__always_inline__, __nodebug__))
316_mm_comigt_sd(__m128d a, __m128d b)
317{
318  return __builtin_ia32_comisdgt(a, b);
319}
320
321static inline int __attribute__((__always_inline__, __nodebug__))
322_mm_comineq_sd(__m128d a, __m128d b)
323{
324  return __builtin_ia32_comisdneq(a, b);
325}
326
327static inline int __attribute__((__always_inline__, __nodebug__))
328_mm_ucomieq_sd(__m128d a, __m128d b)
329{
330  return __builtin_ia32_ucomisdeq(a, b);
331}
332
333static inline int __attribute__((__always_inline__, __nodebug__))
334_mm_ucomilt_sd(__m128d a, __m128d b)
335{
336  return __builtin_ia32_ucomisdlt(a, b);
337}
338
339static inline int __attribute__((__always_inline__, __nodebug__))
340_mm_ucomile_sd(__m128d a, __m128d b)
341{
342  return __builtin_ia32_ucomisdle(a, b);
343}
344
345static inline int __attribute__((__always_inline__, __nodebug__))
346_mm_ucomigt_sd(__m128d a, __m128d b)
347{
348  return __builtin_ia32_ucomisdgt(a, b);
349}
350
351static inline int __attribute__((__always_inline__, __nodebug__))
352_mm_ucomineq_sd(__m128d a, __m128d b)
353{
354  return __builtin_ia32_ucomisdneq(a, b);
355}
356
357static inline __m128 __attribute__((__always_inline__, __nodebug__))
358_mm_cvtpd_ps(__m128d a)
359{
360  return __builtin_ia32_cvtpd2ps(a);
361}
362
363static inline __m128d __attribute__((__always_inline__, __nodebug__))
364_mm_cvtps_pd(__m128 a)
365{
366  return __builtin_ia32_cvtps2pd(a);
367}
368
369static inline __m128d __attribute__((__always_inline__, __nodebug__))
370_mm_cvtepi32_pd(__m128i a)
371{
372  return __builtin_ia32_cvtdq2pd((__v4si)a);
373}
374
375static inline __m128i __attribute__((__always_inline__, __nodebug__))
376_mm_cvtpd_epi32(__m128d a)
377{
378  return __builtin_ia32_cvtpd2dq(a);
379}
380
381static inline int __attribute__((__always_inline__, __nodebug__))
382_mm_cvtsd_si32(__m128d a)
383{
384  return __builtin_ia32_cvtsd2si(a);
385}
386
387static inline __m128 __attribute__((__always_inline__, __nodebug__))
388_mm_cvtsd_ss(__m128 a, __m128d b)
389{
390  a[0] = b[0];
391  return a;
392}
393
394static inline __m128d __attribute__((__always_inline__, __nodebug__))
395_mm_cvtsi32_sd(__m128d a, int b)
396{
397  return __builtin_ia32_cvtsi2sd(a, b);
398}
399
400static inline __m128d __attribute__((__always_inline__, __nodebug__))
401_mm_cvtss_sd(__m128d a, __m128 b)
402{
403  a[0] = b[0];
404  return a;
405}
406
407static inline __m128i __attribute__((__always_inline__, __nodebug__))
408_mm_cvttpd_epi32(__m128d a)
409{
410  return (__m128i)__builtin_ia32_cvttpd2dq(a);
411}
412
413static inline int __attribute__((__always_inline__, __nodebug__))
414_mm_cvttsd_si32(__m128d a)
415{
416  return a[0];
417}
418
419static inline __m64 __attribute__((__always_inline__, __nodebug__))
420_mm_cvtpd_pi32(__m128d a)
421{
422  return (__m64)__builtin_ia32_cvtpd2pi(a);
423}
424
425static inline __m64 __attribute__((__always_inline__, __nodebug__))
426_mm_cvttpd_pi32(__m128d a)
427{
428  return (__m64)__builtin_ia32_cvttpd2pi(a);
429}
430
431static inline __m128d __attribute__((__always_inline__, __nodebug__))
432_mm_cvtpi32_pd(__m64 a)
433{
434  return __builtin_ia32_cvtpi2pd((__v2si)a);
435}
436
437static inline double __attribute__((__always_inline__, __nodebug__))
438_mm_cvtsd_f64(__m128d a)
439{
440  return a[0];
441}
442
443static inline __m128d __attribute__((__always_inline__, __nodebug__))
444_mm_load_pd(double const *dp)
445{
446  return *(__m128d*)dp;
447}
448
449static inline __m128d __attribute__((__always_inline__, __nodebug__))
450_mm_load1_pd(double const *dp)
451{
452  return (__m128d){ dp[0], dp[0] };
453}
454
/* Alternate spelling: _mm_load_pd1(dp) expands to _mm_load1_pd(dp). */
#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
456
457static inline __m128d __attribute__((__always_inline__, __nodebug__))
458_mm_loadr_pd(double const *dp)
459{
460  return (__m128d){ dp[1], dp[0] };
461}
462
463static inline __m128d __attribute__((__always_inline__, __nodebug__))
464_mm_loadu_pd(double const *dp)
465{
466  return __builtin_ia32_loadupd(dp);
467}
468
469static inline __m128d __attribute__((__always_inline__, __nodebug__))
470_mm_load_sd(double const *dp)
471{
472  return (__m128d){ *dp, 0.0 };
473}
474
475static inline __m128d __attribute__((__always_inline__, __nodebug__))
476_mm_loadh_pd(__m128d a, double const *dp)
477{
478  return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
479}
480
481static inline __m128d __attribute__((__always_inline__, __nodebug__))
482_mm_loadl_pd(__m128d a, double const *dp)
483{
484  return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
485}
486
487static inline __m128d __attribute__((__always_inline__, __nodebug__))
488_mm_set_sd(double w)
489{
490  return (__m128d){ w, 0 };
491}
492
493static inline __m128d __attribute__((__always_inline__, __nodebug__))
494_mm_set1_pd(double w)
495{
496  return (__m128d){ w, w };
497}
498
499static inline __m128d __attribute__((__always_inline__, __nodebug__))
500_mm_set_pd(double w, double x)
501{
502  return (__m128d){ w, x };
503}
504
505static inline __m128d __attribute__((__always_inline__, __nodebug__))
506_mm_setr_pd(double w, double x)
507{
508  return (__m128d){ x, w };
509}
510
511static inline __m128d __attribute__((__always_inline__, __nodebug__))
512_mm_setzero_pd(void)
513{
514  return (__m128d){ 0, 0 };
515}
516
517static inline __m128d __attribute__((__always_inline__, __nodebug__))
518_mm_move_sd(__m128d a, __m128d b)
519{
520  return (__m128d){ b[0], a[1] };
521}
522
523static inline void __attribute__((__always_inline__, __nodebug__))
524_mm_store_sd(double *dp, __m128d a)
525{
526  dp[0] = a[0];
527}
528
529static inline void __attribute__((__always_inline__, __nodebug__))
530_mm_store1_pd(double *dp, __m128d a)
531{
532  dp[0] = a[0];
533  dp[1] = a[0];
534}
535
536static inline void __attribute__((__always_inline__, __nodebug__))
537_mm_store_pd(double *dp, __m128d a)
538{
539  *(__m128d *)dp = a;
540}
541
542static inline void __attribute__((__always_inline__, __nodebug__))
543_mm_storeu_pd(double *dp, __m128d a)
544{
545  __builtin_ia32_storeupd(dp, a);
546}
547
548static inline void __attribute__((__always_inline__, __nodebug__))
549_mm_storer_pd(double *dp, __m128d a)
550{
551  dp[0] = a[1];
552  dp[1] = a[0];
553}
554
555static inline void __attribute__((__always_inline__, __nodebug__))
556_mm_storeh_pd(double *dp, __m128d a)
557{
558  dp[0] = a[1];
559}
560
561static inline void __attribute__((__always_inline__, __nodebug__))
562_mm_storel_pd(double *dp, __m128d a)
563{
564  dp[0] = a[0];
565}
566
567static inline __m128i __attribute__((__always_inline__, __nodebug__))
568_mm_add_epi8(__m128i a, __m128i b)
569{
570  return (__m128i)((__v16qi)a + (__v16qi)b);
571}
572
573static inline __m128i __attribute__((__always_inline__, __nodebug__))
574_mm_add_epi16(__m128i a, __m128i b)
575{
576  return (__m128i)((__v8hi)a + (__v8hi)b);
577}
578
579static inline __m128i __attribute__((__always_inline__, __nodebug__))
580_mm_add_epi32(__m128i a, __m128i b)
581{
582  return (__m128i)((__v4si)a + (__v4si)b);
583}
584
585static inline __m64 __attribute__((__always_inline__, __nodebug__))
586_mm_add_si64(__m64 a, __m64 b)
587{
588  return a + b;
589}
590
591static inline __m128i __attribute__((__always_inline__, __nodebug__))
592_mm_add_epi64(__m128i a, __m128i b)
593{
594  return a + b;
595}
596
597static inline __m128i __attribute__((__always_inline__, __nodebug__))
598_mm_adds_epi8(__m128i a, __m128i b)
599{
600  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
601}
602
603static inline __m128i __attribute__((__always_inline__, __nodebug__))
604_mm_adds_epi16(__m128i a, __m128i b)
605{
606  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
607}
608
609static inline __m128i __attribute__((__always_inline__, __nodebug__))
610_mm_adds_epu8(__m128i a, __m128i b)
611{
612  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
613}
614
615static inline __m128i __attribute__((__always_inline__, __nodebug__))
616_mm_adds_epu16(__m128i a, __m128i b)
617{
618  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
619}
620
621static inline __m128i __attribute__((__always_inline__, __nodebug__))
622_mm_avg_epu8(__m128i a, __m128i b)
623{
624  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
625}
626
627static inline __m128i __attribute__((__always_inline__, __nodebug__))
628_mm_avg_epu16(__m128i a, __m128i b)
629{
630  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
631}
632
633static inline __m128i __attribute__((__always_inline__, __nodebug__))
634_mm_madd_epi16(__m128i a, __m128i b)
635{
636  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
637}
638
639static inline __m128i __attribute__((__always_inline__, __nodebug__))
640_mm_max_epi16(__m128i a, __m128i b)
641{
642  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
643}
644
645static inline __m128i __attribute__((__always_inline__, __nodebug__))
646_mm_max_epu8(__m128i a, __m128i b)
647{
648  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
649}
650
651static inline __m128i __attribute__((__always_inline__, __nodebug__))
652_mm_min_epi16(__m128i a, __m128i b)
653{
654  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
655}
656
657static inline __m128i __attribute__((__always_inline__, __nodebug__))
658_mm_min_epu8(__m128i a, __m128i b)
659{
660  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
661}
662
663static inline __m128i __attribute__((__always_inline__, __nodebug__))
664_mm_mulhi_epi16(__m128i a, __m128i b)
665{
666  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
667}
668
669static inline __m128i __attribute__((__always_inline__, __nodebug__))
670_mm_mulhi_epu16(__m128i a, __m128i b)
671{
672  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
673}
674
675static inline __m128i __attribute__((__always_inline__, __nodebug__))
676_mm_mullo_epi16(__m128i a, __m128i b)
677{
678  return (__m128i)__builtin_ia32_pmullw128((__v8hi)a, (__v8hi)b);
679}
680
681static inline __m64 __attribute__((__always_inline__, __nodebug__))
682_mm_mul_su32(__m64 a, __m64 b)
683{
684  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
685}
686
687static inline __m128i __attribute__((__always_inline__, __nodebug__))
688_mm_mul_epu32(__m128i a, __m128i b)
689{
690  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
691}
692
693static inline __m128i __attribute__((__always_inline__, __nodebug__))
694_mm_sad_epu8(__m128i a, __m128i b)
695{
696  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
697}
698
699static inline __m128i __attribute__((__always_inline__, __nodebug__))
700_mm_sub_epi8(__m128i a, __m128i b)
701{
702  return (__m128i)((__v16qi)a - (__v16qi)b);
703}
704
705static inline __m128i __attribute__((__always_inline__, __nodebug__))
706_mm_sub_epi16(__m128i a, __m128i b)
707{
708  return (__m128i)((__v8hi)a - (__v8hi)b);
709}
710
711static inline __m128i __attribute__((__always_inline__, __nodebug__))
712_mm_sub_epi32(__m128i a, __m128i b)
713{
714  return (__m128i)((__v4si)a - (__v4si)b);
715}
716
717static inline __m64 __attribute__((__always_inline__, __nodebug__))
718_mm_sub_si64(__m64 a, __m64 b)
719{
720  return a - b;
721}
722
723static inline __m128i __attribute__((__always_inline__, __nodebug__))
724_mm_sub_epi64(__m128i a, __m128i b)
725{
726  return a - b;
727}
728
729static inline __m128i __attribute__((__always_inline__, __nodebug__))
730_mm_subs_epi8(__m128i a, __m128i b)
731{
732  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
733}
734
735static inline __m128i __attribute__((__always_inline__, __nodebug__))
736_mm_subs_epi16(__m128i a, __m128i b)
737{
738  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
739}
740
741static inline __m128i __attribute__((__always_inline__, __nodebug__))
742_mm_subs_epu8(__m128i a, __m128i b)
743{
744  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
745}
746
747static inline __m128i __attribute__((__always_inline__, __nodebug__))
748_mm_subs_epu16(__m128i a, __m128i b)
749{
750  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
751}
752
753static inline __m128i __attribute__((__always_inline__, __nodebug__))
754_mm_and_si128(__m128i a, __m128i b)
755{
756  return a & b;
757}
758
759static inline __m128i __attribute__((__always_inline__, __nodebug__))
760_mm_andnot_si128(__m128i a, __m128i b)
761{
762  return ~a & b;
763}
764
765static inline __m128i __attribute__((__always_inline__, __nodebug__))
766_mm_or_si128(__m128i a, __m128i b)
767{
768  return a | b;
769}
770
771static inline __m128i __attribute__((__always_inline__, __nodebug__))
772_mm_xor_si128(__m128i a, __m128i b)
773{
774  return a ^ b;
775}
776
777static inline __m128i __attribute__((__always_inline__, __nodebug__))
778_mm_slli_si128(__m128i a, int imm)
779{
780  return __builtin_ia32_pslldqi128(a, imm * 8);
781}
782
783static inline __m128i __attribute__((__always_inline__, __nodebug__))
784_mm_slli_epi16(__m128i a, int count)
785{
786  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
787}
788
789static inline __m128i __attribute__((__always_inline__, __nodebug__))
790_mm_sll_epi16(__m128i a, __m128i count)
791{
792  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
793}
794
795static inline __m128i __attribute__((__always_inline__, __nodebug__))
796_mm_slli_epi32(__m128i a, int count)
797{
798  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
799}
800
801static inline __m128i __attribute__((__always_inline__, __nodebug__))
802_mm_sll_epi32(__m128i a, __m128i count)
803{
804  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
805}
806
807static inline __m128i __attribute__((__always_inline__, __nodebug__))
808_mm_slli_epi64(__m128i a, int count)
809{
810  return __builtin_ia32_psllqi128(a, count);
811}
812
813static inline __m128i __attribute__((__always_inline__, __nodebug__))
814_mm_sll_epi64(__m128i a, __m128i count)
815{
816  return __builtin_ia32_psllq128(a, count);
817}
818
819static inline __m128i __attribute__((__always_inline__, __nodebug__))
820_mm_srai_epi16(__m128i a, int count)
821{
822  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
823}
824
825static inline __m128i __attribute__((__always_inline__, __nodebug__))
826_mm_sra_epi16(__m128i a, __m128i count)
827{
828  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
829}
830
831static inline __m128i __attribute__((__always_inline__, __nodebug__))
832_mm_srai_epi32(__m128i a, int count)
833{
834  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
835}
836
837static inline __m128i __attribute__((__always_inline__, __nodebug__))
838_mm_sra_epi32(__m128i a, __m128i count)
839{
840  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
841}
842
843static inline __m128i __attribute__((__always_inline__, __nodebug__))
844_mm_srli_si128(__m128i a, int imm)
845{
846  return __builtin_ia32_psrldqi128(a, imm * 8);
847}
848
849static inline __m128i __attribute__((__always_inline__, __nodebug__))
850_mm_srli_epi16(__m128i a, int count)
851{
852  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
853}
854
855static inline __m128i __attribute__((__always_inline__, __nodebug__))
856_mm_srl_epi16(__m128i a, __m128i count)
857{
858  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
859}
860
861static inline __m128i __attribute__((__always_inline__, __nodebug__))
862_mm_srli_epi32(__m128i a, int count)
863{
864  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
865}
866
867static inline __m128i __attribute__((__always_inline__, __nodebug__))
868_mm_srl_epi32(__m128i a, __m128i count)
869{
870  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
871}
872
873static inline __m128i __attribute__((__always_inline__, __nodebug__))
874_mm_srli_epi64(__m128i a, int count)
875{
876  return __builtin_ia32_psrlqi128(a, count);
877}
878
879static inline __m128i __attribute__((__always_inline__, __nodebug__))
880_mm_srl_epi64(__m128i a, __m128i count)
881{
882  return __builtin_ia32_psrlq128(a, count);
883}
884
885static inline __m128i __attribute__((__always_inline__, __nodebug__))
886_mm_cmpeq_epi8(__m128i a, __m128i b)
887{
888  return (__m128i)__builtin_ia32_pcmpeqb128((__v16qi)a, (__v16qi)b);
889}
890
891static inline __m128i __attribute__((__always_inline__, __nodebug__))
892_mm_cmpeq_epi16(__m128i a, __m128i b)
893{
894  return (__m128i)__builtin_ia32_pcmpeqw128((__v8hi)a, (__v8hi)b);
895}
896
897static inline __m128i __attribute__((__always_inline__, __nodebug__))
898_mm_cmpeq_epi32(__m128i a, __m128i b)
899{
900  return (__m128i)__builtin_ia32_pcmpeqd128((__v4si)a, (__v4si)b);
901}
902
903static inline __m128i __attribute__((__always_inline__, __nodebug__))
904_mm_cmpgt_epi8(__m128i a, __m128i b)
905{
906  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)a, (__v16qi)b);
907}
908
909static inline __m128i __attribute__((__always_inline__, __nodebug__))
910_mm_cmpgt_epi16(__m128i a, __m128i b)
911{
912  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)a, (__v8hi)b);
913}
914
915static inline __m128i __attribute__((__always_inline__, __nodebug__))
916_mm_cmpgt_epi32(__m128i a, __m128i b)
917{
918  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)a, (__v4si)b);
919}
920
921static inline __m128i __attribute__((__always_inline__, __nodebug__))
922_mm_cmplt_epi8(__m128i a, __m128i b)
923{
924  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)b, (__v16qi)a);
925}
926
927static inline __m128i __attribute__((__always_inline__, __nodebug__))
928_mm_cmplt_epi16(__m128i a, __m128i b)
929{
930  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)b, (__v8hi)a);
931}
932
933static inline __m128i __attribute__((__always_inline__, __nodebug__))
934_mm_cmplt_epi32(__m128i a, __m128i b)
935{
936  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)b, (__v4si)a);
937}
938
939#ifdef __x86_64__
940static inline __m128d __attribute__((__always_inline__, __nodebug__))
941_mm_cvtsi64_sd(__m128d a, long long b)
942{
943  a[0] = b;
944  return a;
945}
946
947static inline long long __attribute__((__always_inline__, __nodebug__))
948_mm_cvtsd_si64(__m128d a)
949{
950  return __builtin_ia32_cvtsd2si64(a);
951}
952
953static inline long long __attribute__((__always_inline__, __nodebug__))
954_mm_cvttsd_si64(__m128d a)
955{
956  return a[0];
957}
958#endif
959
960static inline __m128 __attribute__((__always_inline__, __nodebug__))
961_mm_cvtepi32_ps(__m128i a)
962{
963  return __builtin_ia32_cvtdq2ps((__v4si)a);
964}
965
966static inline __m128i __attribute__((__always_inline__, __nodebug__))
967_mm_cvtps_epi32(__m128 a)
968{
969  return (__m128i)__builtin_ia32_cvtps2dq(a);
970}
971
972static inline __m128i __attribute__((__always_inline__, __nodebug__))
973_mm_cvttps_epi32(__m128 a)
974{
975  return (__m128i)__builtin_ia32_cvttps2dq(a);
976}
977
978static inline __m128i __attribute__((__always_inline__, __nodebug__))
979_mm_cvtsi32_si128(int a)
980{
981  return (__m128i)(__v4si){ a, 0, 0, 0 };
982}
983
984#ifdef __x86_64__
985static inline __m128i __attribute__((__always_inline__, __nodebug__))
986_mm_cvtsi64_si128(long long a)
987{
988  return (__m128i){ a, 0 };
989}
990#endif
991
992static inline int __attribute__((__always_inline__, __nodebug__))
993_mm_cvtsi128_si32(__m128i a)
994{
995  __v4si b = (__v4si)a;
996  return b[0];
997}
998
999#ifdef __x86_64__
1000static inline long long __attribute__((__always_inline__, __nodebug__))
1001_mm_cvtsi128_si64(__m128i a)
1002{
1003  return a[0];
1004}
1005#endif
1006
1007static inline __m128i __attribute__((__always_inline__, __nodebug__))
1008_mm_load_si128(__m128i const *p)
1009{
1010  return *p;
1011}
1012
1013static inline __m128i __attribute__((__always_inline__, __nodebug__))
1014_mm_loadu_si128(__m128i const *p)
1015{
1016  return (__m128i)__builtin_ia32_loaddqu((char const *)p);
1017}
1018
1019static inline __m128i __attribute__((__always_inline__, __nodebug__))
1020_mm_loadl_epi64(__m128i const *p)
1021{
1022  return (__m128i)__builtin_ia32_loadlv4si((__v2si *)p);
1023}
1024
1025static inline __m128i __attribute__((__always_inline__, __nodebug__))
1026_mm_set_epi64(__m64 q1, __m64 q0)
1027{
1028  return (__m128i){ (long long)q0, (long long)q1 };
1029}
1030
1031static inline __m128i __attribute__((__always_inline__, __nodebug__))
1032_mm_set_epi32(int i3, int i2, int i1, int i0)
1033{
1034  return (__m128i)(__v4si){ i0, i1, i2, i3};
1035}
1036
1037static inline __m128i __attribute__((__always_inline__, __nodebug__))
1038_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1039{
1040  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1041}
1042
1043static inline __m128i __attribute__((__always_inline__, __nodebug__))
1044_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1045{
1046  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1047}
1048
1049static inline __m128i __attribute__((__always_inline__, __nodebug__))
1050_mm_set1_epi64(__m64 q)
1051{
1052  return (__m128i){ (long long)q, (long long)q };
1053}
1054
1055static inline __m128i __attribute__((__always_inline__, __nodebug__))
1056_mm_set1_epi32(int i)
1057{
1058  return (__m128i)(__v4si){ i, i, i, i };
1059}
1060
1061static inline __m128i __attribute__((__always_inline__, __nodebug__))
1062_mm_set1_epi16(short w)
1063{
1064  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1065}
1066
1067static inline __m128i __attribute__((__always_inline__, __nodebug__))
1068_mm_set1_epi8(char b)
1069{
1070  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1071}
1072
1073static inline __m128i __attribute__((__always_inline__, __nodebug__))
1074_mm_setr_epi64(__m64 q0, __m64 q1)
1075{
1076  return (__m128i){ (long long)q0, (long long)q1 };
1077}
1078
1079static inline __m128i __attribute__((__always_inline__, __nodebug__))
1080_mm_setr_epi32(int i0, int i1, int i2, int i3)
1081{
1082  return (__m128i)(__v4si){ i0, i1, i2, i3};
1083}
1084
1085static inline __m128i __attribute__((__always_inline__, __nodebug__))
1086_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1087{
1088  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1089}
1090
1091static inline __m128i __attribute__((__always_inline__, __nodebug__))
1092_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1093{
1094  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1095}
1096
1097static inline __m128i __attribute__((__always_inline__, __nodebug__))
1098_mm_setzero_si128(void)
1099{
1100  return (__m128i){ 0LL, 0LL };
1101}
1102
1103static inline void __attribute__((__always_inline__, __nodebug__))
1104_mm_store_si128(__m128i *p, __m128i b)
1105{
1106  *p = b;
1107}
1108
1109static inline void __attribute__((__always_inline__, __nodebug__))
1110_mm_storeu_si128(__m128i *p, __m128i b)
1111{
1112  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1113}
1114
1115static inline void __attribute__((__always_inline__, __nodebug__))
1116_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1117{
1118  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1119}
1120
1121static inline void __attribute__((__always_inline__, __nodebug__))
1122_mm_storel_epi64(__m128i *p, __m128i a)
1123{
1124  __builtin_ia32_storelv4si((__v2si *)p, a);
1125}
1126
1127static inline void __attribute__((__always_inline__, __nodebug__))
1128_mm_stream_pd(double *p, __m128d a)
1129{
1130  __builtin_ia32_movntpd(p, a);
1131}
1132
1133static inline void __attribute__((__always_inline__, __nodebug__))
1134_mm_stream_si128(__m128i *p, __m128i a)
1135{
1136  __builtin_ia32_movntdq(p, a);
1137}
1138
/*
 * Thin wrapper: hands p and a unchanged to __builtin_ia32_movnti.
 * NOTE(review): presumably a non-temporal store of a to *p — confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *dst = p;
  __builtin_ia32_movnti(dst, a);
}
1144
/* Thin wrapper: passes the address p unchanged to __builtin_ia32_clflush. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *line = p;
  __builtin_ia32_clflush(line);
}
1150
/*
 * Invokes __builtin_ia32_lfence; takes no arguments and returns nothing.
 * NOTE(review): presumably emits the LFENCE load fence — confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
  return;
}
1156
/*
 * Invokes __builtin_ia32_mfence; takes no arguments and returns nothing.
 * NOTE(review): presumably emits the MFENCE full memory fence — confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
  return;
}
1162
1163static inline __m128i __attribute__((__always_inline__, __nodebug__))
1164_mm_packs_epi16(__m128i a, __m128i b)
1165{
1166  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1167}
1168
1169static inline __m128i __attribute__((__always_inline__, __nodebug__))
1170_mm_packs_epi32(__m128i a, __m128i b)
1171{
1172  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1173}
1174
1175static inline __m128i __attribute__((__always_inline__, __nodebug__))
1176_mm_packus_epi16(__m128i a, __m128i b)
1177{
1178  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1179}
1180
1181static inline int __attribute__((__always_inline__, __nodebug__))
1182_mm_extract_epi16(__m128i a, int imm)
1183{
1184  __v8hi b = (__v8hi)a;
1185  return b[imm];
1186}
1187
1188static inline __m128i __attribute__((__always_inline__, __nodebug__))
1189_mm_insert_epi16(__m128i a, int b, int imm)
1190{
1191  __v8hi c = (__v8hi)a;
1192  c[imm & 7] = b;
1193  return c;
1194}
1195
1196static inline int __attribute__((__always_inline__, __nodebug__))
1197_mm_movemask_epi8(__m128i a)
1198{
1199  return __builtin_ia32_pmovmskb128((__v16qi)a);
1200}
1201
/*
 * Integer shuffles.  Each macro reinterprets a as the lane type its builtin
 * takes, passes imm through untouched, and casts the result back to __m128i.
 * NOTE(review): these are macros rather than functions presumably because
 * the builtins need imm to be a compile-time constant — confirm.
 */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(a), (imm)))
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(a), (imm)))
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(a), (imm)))
1205
1206static inline __m128i __attribute__((__always_inline__, __nodebug__))
1207_mm_unpackhi_epi8(__m128i a, __m128i b)
1208{
1209  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1210}
1211
1212static inline __m128i __attribute__((__always_inline__, __nodebug__))
1213_mm_unpackhi_epi16(__m128i a, __m128i b)
1214{
1215  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1216}
1217
1218static inline __m128i __attribute__((__always_inline__, __nodebug__))
1219_mm_unpackhi_epi32(__m128i a, __m128i b)
1220{
1221  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1222}
1223
1224static inline __m128i __attribute__((__always_inline__, __nodebug__))
1225_mm_unpackhi_epi64(__m128i a, __m128i b)
1226{
1227  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1228}
1229
1230static inline __m128i __attribute__((__always_inline__, __nodebug__))
1231_mm_unpacklo_epi8(__m128i a, __m128i b)
1232{
1233  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1234}
1235
1236static inline __m128i __attribute__((__always_inline__, __nodebug__))
1237_mm_unpacklo_epi16(__m128i a, __m128i b)
1238{
1239  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1240}
1241
1242static inline __m128i __attribute__((__always_inline__, __nodebug__))
1243_mm_unpacklo_epi32(__m128i a, __m128i b)
1244{
1245  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1246}
1247
1248static inline __m128i __attribute__((__always_inline__, __nodebug__))
1249_mm_unpacklo_epi64(__m128i a, __m128i b)
1250{
1251  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1252}
1253
1254static inline __m64 __attribute__((__always_inline__, __nodebug__))
1255_mm_movepi64_pi64(__m128i a)
1256{
1257  return (__m64)a[0];
1258}
1259
1260static inline __m128i __attribute__((__always_inline__, __nodebug__))
1261_mm_movpi64_pi64(__m64 a)
1262{
1263  return (__m128i){ (long long)a, 0 };
1264}
1265
1266static inline __m128i __attribute__((__always_inline__, __nodebug__))
1267_mm_move_epi64(__m128i a)
1268{
1269  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1270}
1271
1272static inline __m128d __attribute__((__always_inline__, __nodebug__))
1273_mm_unpackhi_pd(__m128d a, __m128d b)
1274{
1275  return __builtin_shufflevector(a, b, 1, 2+1);
1276}
1277
1278static inline __m128d __attribute__((__always_inline__, __nodebug__))
1279_mm_unpacklo_pd(__m128d a, __m128d b)
1280{
1281  return __builtin_shufflevector(a, b, 0, 2+0);
1282}
1283
1284static inline int __attribute__((__always_inline__, __nodebug__))
1285_mm_movemask_pd(__m128d a)
1286{
1287  return __builtin_ia32_movmskpd(a);
1288}
1289
/*
 * Passes a, b and the selector i straight to __builtin_ia32_shufpd.
 * The _MM_SHUFFLE2 macro in this header can be used to build i.
 */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_ia32_shufpd((a), (b), (i)))
1291
1292static inline __m128 __attribute__((__always_inline__, __nodebug__))
1293_mm_castpd_ps(__m128d in)
1294{
1295  return (__m128)in;
1296}
1297
1298static inline __m128i __attribute__((__always_inline__, __nodebug__))
1299_mm_castpd_si128(__m128d in)
1300{
1301  return (__m128i)in;
1302}
1303
1304static inline __m128d __attribute__((__always_inline__, __nodebug__))
1305_mm_castps_pd(__m128 in)
1306{
1307  return (__m128d)in;
1308}
1309
1310static inline __m128i __attribute__((__always_inline__, __nodebug__))
1311_mm_castps_si128(__m128 in)
1312{
1313  return (__m128i)in;
1314}
1315
1316static inline __m128 __attribute__((__always_inline__, __nodebug__))
1317_mm_castsi128_ps(__m128i in)
1318{
1319  return (__m128)in;
1320}
1321
1322static inline __m128d __attribute__((__always_inline__, __nodebug__))
1323_mm_castsi128_pd(__m128i in)
1324{
1325  return (__m128d)in;
1326}
1327
/*
 * Emits a single "pause" instruction through inline assembly.  The asm is
 * marked volatile so the compiler does not discard it for having no outputs.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1333
/*
 * Packs two selectors into one value: x goes to bit 1 and y to bit 0
 * (assuming each is 0 or 1).
 */
#define _MM_SHUFFLE2(x, y) \
  ((y) | ((x) << 1))
1335
1336#endif /* __SSE2__ */
1337
1338#endif /* __EMMINTRIN_H */
1339