emmintrin.h revision 4fd3e63cb043cbd140a3e8028374bd2e4312b90e
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public SSE2 vector types: two doubles / one 128-bit integer vector. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal element-wise views of the 128-bit integer vector. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
39
/* Add the low doubles (ADDSD); the high result element comes from a. */
static inline __m128d __attribute__((__always_inline__)) _mm_add_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_addsd(a, b);
}

/* Add both double elements (ADDPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_add_pd(__m128d a, __m128d b)
{
  return a + b;
}

/* Subtract the low doubles (SUBSD); the high result element comes from a. */
static inline __m128d __attribute__((__always_inline__)) _mm_sub_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_subsd(a, b);
}

/* Subtract both double elements (SUBPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_sub_pd(__m128d a, __m128d b)
{
  return a - b;
}

/* Multiply the low doubles (MULSD); the high result element comes from a. */
static inline __m128d __attribute__((__always_inline__)) _mm_mul_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_mulsd(a, b);
}

/* Multiply both double elements (MULPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_mul_pd(__m128d a, __m128d b)
{
  return a * b;
}

/* Divide the low doubles (DIVSD); the high result element comes from a. */
static inline __m128d __attribute__((__always_inline__)) _mm_div_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_divsd(a, b);
}

/* Divide both double elements (DIVPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_div_pd(__m128d a, __m128d b)
{
  return a / b;
}

/* Square root of the low double of b (SQRTSD); high element taken from a. */
static inline __m128d __attribute__((__always_inline__)) _mm_sqrt_sd(__m128d a, __m128d b)
{
  __m128d c = __builtin_ia32_sqrtsd(b);
  return (__m128d) { c[0], a[1] };
}

/* Square root of both double elements (SQRTPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_sqrt_pd(__m128d a)
{
  return __builtin_ia32_sqrtpd(a);
}
90
/* Minimum of the low doubles (MINSD); high result element comes from a. */
static inline __m128d __attribute__((__always_inline__)) _mm_min_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_minsd(a, b);
}

/* Element-wise minimum of both doubles (MINPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_min_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_minpd(a, b);
}

/* Maximum of the low doubles (MAXSD); high result element comes from a. */
static inline __m128d __attribute__((__always_inline__)) _mm_max_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_maxsd(a, b);
}

/* Element-wise maximum of both doubles (MAXPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_max_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_maxpd(a, b);
}

/* Bitwise AND of the 128-bit values (ANDPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_and_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_andpd(a, b);
}

/* Bitwise AND of b with the complement of a (ANDNPD): (~a) & b. */
static inline __m128d __attribute__((__always_inline__)) _mm_andnot_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_andnpd(a, b);
}

/* Bitwise OR of the 128-bit values (ORPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_or_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_orpd(a, b);
}

/* Bitwise XOR of the 128-bit values (XORPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_xor_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_xorpd(a, b);
}
130
/* Packed double compares: each lane becomes all-ones on true, all-zeros on
 * false (CMPPD with the respective predicate). */

/* Lane-wise a == b (CMPEQPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmpeq_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpeqpd(a, b);
}

/* Lane-wise a < b (CMPLTPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmplt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpltpd(a, b);
}

/* Lane-wise a <= b (CMPLEPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmple_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmplepd(a, b);
}

/* Lane-wise a > b, expressed as b < a (no CMPGTPD encoding exists). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmpgt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpltpd(b, a);
}

/* Lane-wise a >= b, expressed as b <= a. */
static inline __m128d __attribute__((__always_inline__)) _mm_cmpge_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmplepd(b, a);
}

/* Lane-wise "both operands are not NaN" (CMPORDPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmpord_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpordpd(a, b);
}

/* Lane-wise "at least one operand is NaN" (CMPUNORDPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmpunord_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpunordpd(a, b);
}

/* Lane-wise a != b (CMPNEQPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmpneq_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpneqpd(a, b);
}

/* Lane-wise !(a < b) (CMPNLTPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmpnlt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpnltpd(a, b);
}

/* Lane-wise !(a <= b) (CMPNLEPD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmpnle_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpnlepd(a, b);
}

/* Lane-wise !(a > b), expressed as !(b < a). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmpngt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpnltpd(b, a);
}

/* Lane-wise !(a >= b), expressed as !(b <= a). */
static inline __m128d __attribute__((__always_inline__)) _mm_cmpnge_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpnlepd(b, a);
}
190
191static inline __m128d __attribute__((__always_inline__)) _mm_cmpeq_sd(__m128d a, __m128d b)
192{
193  return (__m128d)__builtin_ia32_cmpeqsd(a, b);
194}
195
196static inline __m128d __attribute__((__always_inline__)) _mm_cmplt_sd(__m128d a, __m128d b)
197{
198  return (__m128d)__builtin_ia32_cmpltsd(a, b);
199}
200
201static inline __m128d __attribute__((__always_inline__)) _mm_cmple_sd(__m128d a, __m128d b)
202{
203  return (__m128d)__builtin_ia32_cmplesd(a, b);
204}
205
206static inline __m128d __attribute__((__always_inline__)) _mm_cmpgt_sd(__m128d a, __m128d b)
207{
208  return (__m128d)__builtin_ia32_cmpltsd(b, a);
209}
210
211static inline __m128d __attribute__((__always_inline__)) _mm_cmpge_sd(__m128d a, __m128d b)
212{
213  return (__m128d)__builtin_ia32_cmplesd(b, a);
214}
215
216static inline __m128d __attribute__((__always_inline__)) _mm_cmpord_sd(__m128d a, __m128d b)
217{
218  return (__m128d)__builtin_ia32_cmpordsd(a, b);
219}
220
221static inline __m128d __attribute__((__always_inline__)) _mm_cmpunord_sd(__m128d a, __m128d b)
222{
223  return (__m128d)__builtin_ia32_cmpunordsd(a, b);
224}
225
226static inline __m128d __attribute__((__always_inline__)) _mm_cmpneq_sd(__m128d a, __m128d b)
227{
228  return (__m128d)__builtin_ia32_cmpneqsd(a, b);
229}
230
231static inline __m128d __attribute__((__always_inline__)) _mm_cmpnlt_sd(__m128d a, __m128d b)
232{
233  return (__m128d)__builtin_ia32_cmpnltsd(a, b);
234}
235
236static inline __m128d __attribute__((__always_inline__)) _mm_cmpnle_sd(__m128d a, __m128d b)
237{
238  return (__m128d)__builtin_ia32_cmpnlesd(a, b);
239}
240
241static inline __m128d __attribute__((__always_inline__)) _mm_cmpngt_sd(__m128d a, __m128d b)
242{
243  return (__m128d)__builtin_ia32_cmpnltsd(b, a);
244}
245
246static inline __m128d __attribute__((__always_inline__)) _mm_cmpnge_sd(__m128d a, __m128d b)
247{
248  return (__m128d)__builtin_ia32_cmpnlesd(b, a);
249}
250
251static inline int __attribute__((__always_inline__)) _mm_comieq_sd(__m128d a, __m128d b)
252{
253  return __builtin_ia32_comisdeq(a, b);
254}
255
256static inline int __attribute__((__always_inline__)) _mm_comilt_sd(__m128d a, __m128d b)
257{
258  return __builtin_ia32_comisdlt(a, b);
259}
260
261static inline int __attribute__((__always_inline__)) _mm_comile_sd(__m128d a, __m128d b)
262{
263  return __builtin_ia32_comisdle(a, b);
264}
265
266static inline int __attribute__((__always_inline__)) _mm_comigt_sd(__m128d a, __m128d b)
267{
268  return __builtin_ia32_comisdgt(a, b);
269}
270
271static inline int __attribute__((__always_inline__)) _mm_comineq_sd(__m128d a, __m128d b)
272{
273  return __builtin_ia32_comisdneq(a, b);
274}
275
276static inline int __attribute__((__always_inline__)) _mm_ucomieq_sd(__m128d a, __m128d b)
277{
278  return __builtin_ia32_ucomisdeq(a, b);
279}
280
281static inline int __attribute__((__always_inline__)) _mm_ucomilt_sd(__m128d a, __m128d b)
282{
283  return __builtin_ia32_ucomisdlt(a, b);
284}
285
286static inline int __attribute__((__always_inline__)) _mm_ucomile_sd(__m128d a, __m128d b)
287{
288  return __builtin_ia32_ucomisdle(a, b);
289}
290
291static inline int __attribute__((__always_inline__)) _mm_ucomigt_sd(__m128d a, __m128d b)
292{
293  return __builtin_ia32_ucomisdgt(a, b);
294}
295
296static inline int __attribute__((__always_inline__)) _mm_ucomineq_sd(__m128d a, __m128d b)
297{
298  return __builtin_ia32_ucomisdneq(a, b);
299}
300
/* Convert two doubles to two floats in the low half; high half zero (CVTPD2PS). */
static inline __m128 __attribute__((__always_inline__)) _mm_cvtpd_ps(__m128d a)
{
  return __builtin_ia32_cvtpd2ps(a);
}

/* Convert the two low floats to two doubles (CVTPS2PD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cvtps_pd(__m128 a)
{
  return __builtin_ia32_cvtps2pd(a);
}

/* Convert the two low 32-bit ints to doubles (CVTDQ2PD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cvtepi32_pd(__m128i a)
{
  return __builtin_ia32_cvtdq2pd((__v4si)a);
}

/* Convert two doubles to 32-bit ints, rounded; high half zero (CVTPD2DQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_cvtpd_epi32(__m128d a)
{
  return __builtin_ia32_cvtpd2dq(a);
}

/* Convert the low double to a rounded 32-bit int (CVTSD2SI). */
static inline int __attribute__((__always_inline__)) _mm_cvtsd_si32(__m128d a)
{
  return __builtin_ia32_cvtsd2si(a);
}

/* Convert low double of b to float in low element of a (CVTSD2SS). */
static inline __m128 __attribute__((__always_inline__)) _mm_cvtsd_ss(__m128 a, __m128d b)
{
  return __builtin_ia32_cvtsd2ss(a, b);
}

/* Convert int b to double in the low element of a (CVTSI2SD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cvtsi32_sd(__m128d a, int b)
{
  return __builtin_ia32_cvtsi2sd(a, b);
}

/* Convert low float of b to double in the low element of a (CVTSS2SD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cvtss_sd(__m128d a, __m128 b)
{
  return __builtin_ia32_cvtss2sd(a, b);
}

/* Convert two doubles to 32-bit ints with truncation (CVTTPD2DQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_cvttpd_epi32(__m128d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq(a);
}

/* Convert the low double to a 32-bit int with truncation (CVTTSD2SI). */
static inline int __attribute__((__always_inline__)) _mm_cvttsd_si32(__m128d a)
{
  return __builtin_ia32_cvttsd2si(a);
}

/* Convert two doubles to two 32-bit ints in an MMX register (CVTPD2PI). */
static inline __m64 __attribute__((__always_inline__)) _mm_cvtpd_pi32(__m128d a)
{
  return (__m64)__builtin_ia32_cvtpd2pi(a);
}

/* Truncating variant of _mm_cvtpd_pi32 (CVTTPD2PI). */
static inline __m64 __attribute__((__always_inline__)) _mm_cvttpd_pi32(__m128d a)
{
  return (__m64)__builtin_ia32_cvttpd2pi(a);
}

/* Convert two 32-bit ints from an MMX register to doubles (CVTPI2PD). */
static inline __m128d __attribute__((__always_inline__)) _mm_cvtpi32_pd(__m64 a)
{
  return __builtin_ia32_cvtpi2pd((__v2si)a);
}

/* Extract the low double as a scalar. */
static inline double __attribute__((__always_inline__)) _mm_cvtsd_f64(__m128d a)
{
  return a[0];
}
370
371static inline __m128d __attribute__((__always_inline__)) _mm_load_pd(double const *dp)
372{
373  return *(__m128d*)dp;
374}
375
376static inline __m128d __attribute__((__always_inline__)) _mm_load1_pd(double const *dp)
377{
378  return (__m128d){ dp[0], dp[0] };
379}
380
381static inline __m128d __attribute__((__always_inline__)) _mm_loadr_pd(double const *dp)
382{
383  return (__m128d){ dp[1], dp[0] };
384}
385
386static inline __m128d __attribute__((__always_inline__)) _mm_loadu_pd(double const *dp)
387{
388  return __builtin_ia32_loadupd(dp);
389}
390
391static inline __m128d __attribute__((__always_inline__)) _mm_load_sd(double const *dp)
392{
393  return (__m128d){ *dp, 0.0 };
394}
395
396static inline __m128d __attribute__((__always_inline__)) _mm_loadh_pd(__m128d a, double const *dp)
397{
398  return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
399}
400
401static inline __m128d __attribute__((__always_inline__)) _mm_loadl_pd(__m128d a, double const *dp)
402{
403  return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
404}
405
/* Set the low element to w; the high element is zeroed. */
static inline __m128d __attribute__((__always_inline__)) _mm_set_sd(double w)
{
  return (__m128d){ w, 0 };
}

/* Broadcast w into both elements. */
static inline __m128d __attribute__((__always_inline__)) _mm_set1_pd(double w)
{
  return (__m128d){ w, w };
}

/* Set elements from high (w) to low (x) — note the Intel argument order. */
static inline __m128d __attribute__((__always_inline__)) _mm_set_pd(double w, double x)
{
  return (__m128d){ w, x };
}

/* Set elements from low (w) to high (x) — reversed argument order. */
static inline __m128d __attribute__((__always_inline__)) _mm_setr_pd(double w, double x)
{
  return (__m128d){ x, w };
}

/* Return a vector of two zero doubles. */
static inline __m128d __attribute__((__always_inline__)) _mm_setzero_pd(void)
{
  return (__m128d){ 0, 0 };
}

/* Take the low element from b and the high element from a (MOVSD). */
static inline __m128d __attribute__((__always_inline__)) _mm_move_sd(__m128d a, __m128d b)
{
  return (__m128d){ b[0], a[1] };
}
435
/* Store the low double to *dp. */
static inline void __attribute__((__always_inline__)) _mm_store_sd(double *dp, __m128d a)
{
  dp[0] = a[0];
}

/* Store the low double to both dp[0] and dp[1]. */
static inline void __attribute__((__always_inline__)) _mm_store1_pd(double *dp, __m128d a)
{
  dp[0] = a[0];
  dp[1] = a[0];
}

/* Store both doubles; dp must be 16-byte aligned (MOVAPD semantics). */
static inline void __attribute__((__always_inline__)) _mm_store_pd(double *dp, __m128d a)
{
  *(__m128d *)dp = a;
}

/* Store both doubles to an unaligned address (MOVUPD). */
static inline void __attribute__((__always_inline__)) _mm_storeu_pd(double *dp, __m128d a)
{
  __builtin_ia32_storeupd(dp, a);
}

/* Store the two doubles in reversed order. */
static inline void __attribute__((__always_inline__)) _mm_storer_pd(double *dp, __m128d a)
{
  dp[0] = a[1];
  dp[1] = a[0];
}

/* Store the high double to *dp. */
static inline void __attribute__((__always_inline__)) _mm_storeh_pd(double *dp, __m128d a)
{
  dp[0] = a[1];
}

/* Store the low double to *dp. */
static inline void __attribute__((__always_inline__)) _mm_storel_pd(double *dp, __m128d a)
{
  dp[0] = a[0];
}
472
/* Add 16 packed bytes with wraparound (PADDB). */
static inline __m128i __attribute__((__always_inline__)) _mm_add_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a + (__v16qi)b);
}

/* Add 8 packed 16-bit words with wraparound (PADDW). */
static inline __m128i __attribute__((__always_inline__)) _mm_add_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a + (__v8hi)b);
}

/* Add 4 packed 32-bit ints with wraparound (PADDD). */
static inline __m128i __attribute__((__always_inline__)) _mm_add_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a + (__v4si)b);
}

/* Add two 64-bit MMX values (PADDQ, MMX form). */
static inline __m64 __attribute__((__always_inline__)) _mm_add_si64(__m64 a, __m64 b)
{
  return a + b;
}

/* Add 2 packed 64-bit ints with wraparound (PADDQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_add_epi64(__m128i a, __m128i b)
{
  return a + b;
}

/* Add 16 signed bytes with saturation (PADDSB). */
static inline __m128i __attribute__((__always_inline__)) _mm_adds_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
}

/* Add 8 signed words with saturation (PADDSW). */
static inline __m128i __attribute__((__always_inline__)) _mm_adds_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
}

/* Add 16 unsigned bytes with saturation (PADDUSB). */
static inline __m128i __attribute__((__always_inline__)) _mm_adds_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
}

/* Add 8 unsigned words with saturation (PADDUSW). */
static inline __m128i __attribute__((__always_inline__)) _mm_adds_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
}
517
/* Rounded average of 16 unsigned bytes (PAVGB). */
static inline __m128i __attribute__((__always_inline__)) _mm_avg_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

/* Rounded average of 8 unsigned words (PAVGW). */
static inline __m128i __attribute__((__always_inline__)) _mm_avg_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

/* Multiply signed words and horizontally add adjacent products (PMADDWD). */
static inline __m128i __attribute__((__always_inline__)) _mm_madd_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}

/* Element-wise maximum of signed words (PMAXSW). */
static inline __m128i __attribute__((__always_inline__)) _mm_max_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
}

/* Element-wise maximum of unsigned bytes (PMAXUB). */
static inline __m128i __attribute__((__always_inline__)) _mm_max_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
}

/* Element-wise minimum of signed words (PMINSW). */
static inline __m128i __attribute__((__always_inline__)) _mm_min_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
}

/* Element-wise minimum of unsigned bytes (PMINUB). */
static inline __m128i __attribute__((__always_inline__)) _mm_min_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
}

/* High 16 bits of signed word products (PMULHW). */
static inline __m128i __attribute__((__always_inline__)) _mm_mulhi_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

/* High 16 bits of unsigned word products (PMULHUW). */
static inline __m128i __attribute__((__always_inline__)) _mm_mulhi_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

/* Low 16 bits of word products (PMULLW). */
static inline __m128i __attribute__((__always_inline__)) _mm_mullo_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmullw128((__v8hi)a, (__v8hi)b);
}

/* Multiply the low unsigned 32-bit halves to a 64-bit product (PMULUDQ, MMX). */
static inline __m64 __attribute__((__always_inline__)) _mm_mul_su32(__m64 a, __m64 b)
{
  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

/* Multiply the even unsigned 32-bit elements to 64-bit products (PMULUDQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_mul_epu32(__m128i a, __m128i b)
{
  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}
577
578static inline __m128i __attribute__((__always_inline__)) _mm_sad_epu(__m128i a, __m128i b)
579{
580  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
581}
582
/* Subtract 16 packed bytes with wraparound (PSUBB). */
static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a - (__v16qi)b);
}

/* Subtract 8 packed words with wraparound (PSUBW). */
static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a - (__v8hi)b);
}

/* Subtract 4 packed 32-bit ints with wraparound (PSUBD). */
static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a - (__v4si)b);
}

/* Subtract two 64-bit MMX values (PSUBQ, MMX form). */
static inline __m64 __attribute__((__always_inline__)) _mm_sub_si64(__m64 a, __m64 b)
{
  return a - b;
}

/* Subtract 2 packed 64-bit ints with wraparound (PSUBQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi64(__m128i a, __m128i b)
{
  return a - b;
}

/* Subtract 16 signed bytes with saturation (PSUBSB). */
static inline __m128i __attribute__((__always_inline__)) _mm_subs_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
}

/* Subtract 8 signed words with saturation (PSUBSW). */
static inline __m128i __attribute__((__always_inline__)) _mm_subs_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
}

/* Subtract 16 unsigned bytes with saturation (PSUBUSB). */
static inline __m128i __attribute__((__always_inline__)) _mm_subs_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
}

/* Subtract 8 unsigned words with saturation (PSUBUSW). */
static inline __m128i __attribute__((__always_inline__)) _mm_subs_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
}
627
/* Bitwise AND of the 128-bit integer values (PAND). */
static inline __m128i __attribute__((__always_inline__)) _mm_and_si128(__m128i a, __m128i b)
{
  return __builtin_ia32_pand128(a, b);
}

/* Bitwise AND of b with the complement of a (PANDN): (~a) & b. */
static inline __m128i __attribute__((__always_inline__)) _mm_andnot_si128(__m128i a, __m128i b)
{
  return __builtin_ia32_pandn128(a, b);
}

/* Bitwise OR of the 128-bit integer values (POR). */
static inline __m128i __attribute__((__always_inline__)) _mm_or_si128(__m128i a, __m128i b)
{
  return __builtin_ia32_por128(a, b);
}

/* Bitwise XOR of the 128-bit integer values (PXOR). */
static inline __m128i __attribute__((__always_inline__)) _mm_xor_si128(__m128i a, __m128i b)
{
  return __builtin_ia32_pxor128(a, b);
}
647
/* Shift the whole 128-bit value left by imm BYTES (PSLLDQ); the builtin
 * takes a bit count, hence the *8. */
static inline __m128i __attribute__((__always_inline__)) _mm_slli_si128(__m128i a, int imm)
{
  return __builtin_ia32_pslldqi128(a, imm * 8);
}

/* Shift 8 words left by an immediate count (PSLLW). */
static inline __m128i __attribute__((__always_inline__)) _mm_slli_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}

/* Shift 8 words left by the count in the low 64 bits of count (PSLLW). */
static inline __m128i __attribute__((__always_inline__)) _mm_sll_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}

/* Shift 4 dwords left by an immediate count (PSLLD). */
static inline __m128i __attribute__((__always_inline__)) _mm_slli_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}

/* Shift 4 dwords left by a vector count (PSLLD). */
static inline __m128i __attribute__((__always_inline__)) _mm_sll_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}

/* Shift 2 qwords left by an immediate count (PSLLQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_slli_epi64(__m128i a, int count)
{
  return __builtin_ia32_psllqi128(a, count);
}

/* Shift 2 qwords left by a vector count (PSLLQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_sll_epi64(__m128i a, __m128i count)
{
  return __builtin_ia32_psllq128(a, count);
}

/* Arithmetic right shift of 8 words by an immediate count (PSRAW). */
static inline __m128i __attribute__((__always_inline__)) _mm_srai_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}

/* Arithmetic right shift of 8 words by a vector count (PSRAW). */
static inline __m128i __attribute__((__always_inline__)) _mm_sra_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}

/* Arithmetic right shift of 4 dwords by an immediate count (PSRAD). */
static inline __m128i __attribute__((__always_inline__)) _mm_srai_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}

/* Arithmetic right shift of 4 dwords by a vector count (PSRAD). */
static inline __m128i __attribute__((__always_inline__)) _mm_sra_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}

/* Shift the whole 128-bit value right by imm BYTES (PSRLDQ); bit count = imm*8. */
static inline __m128i __attribute__((__always_inline__)) _mm_srli_si128(__m128i a, int imm)
{
  return __builtin_ia32_psrldqi128(a, imm * 8);
}

/* Logical right shift of 8 words by an immediate count (PSRLW). */
static inline __m128i __attribute__((__always_inline__)) _mm_srli_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}

/* Logical right shift of 8 words by a vector count (PSRLW). */
static inline __m128i __attribute__((__always_inline__)) _mm_srl_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}

/* Logical right shift of 4 dwords by an immediate count (PSRLD). */
static inline __m128i __attribute__((__always_inline__)) _mm_srli_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}

/* Logical right shift of 4 dwords by a vector count (PSRLD). */
static inline __m128i __attribute__((__always_inline__)) _mm_srl_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}

/* Logical right shift of 2 qwords by an immediate count (PSRLQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_srli_epi64(__m128i a, int count)
{
  return __builtin_ia32_psrlqi128(a, count);
}

/* Logical right shift of 2 qwords by a vector count (PSRLQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_srl_epi64(__m128i a, __m128i count)
{
  return __builtin_ia32_psrlq128(a, count);
}
737
/* Byte-wise a == b; true lanes become 0xFF (PCMPEQB). */
static inline __m128i __attribute__((__always_inline__)) _mm_cmpeq_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpeqb128((__v16qi)a, (__v16qi)b);
}

/* Word-wise a == b (PCMPEQW). */
static inline __m128i __attribute__((__always_inline__)) _mm_cmpeq_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpeqw128((__v8hi)a, (__v8hi)b);
}

/* Dword-wise a == b (PCMPEQD). */
static inline __m128i __attribute__((__always_inline__)) _mm_cmpeq_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpeqd128((__v4si)a, (__v4si)b);
}

/* Signed byte-wise a > b (PCMPGTB). */
static inline __m128i __attribute__((__always_inline__)) _mm_cmpgt_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)a, (__v16qi)b);
}

/* Signed word-wise a > b (PCMPGTW). */
static inline __m128i __attribute__((__always_inline__)) _mm_cmpgt_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)a, (__v8hi)b);
}

/* Signed dword-wise a > b (PCMPGTD). */
static inline __m128i __attribute__((__always_inline__)) _mm_cmpgt_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)a, (__v4si)b);
}

/* Signed byte-wise a < b, via b > a (no PCMPLT encoding exists). */
static inline __m128i __attribute__((__always_inline__)) _mm_cmplt_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)b, (__v16qi)a);
}

/* Signed word-wise a < b, via b > a. */
static inline __m128i __attribute__((__always_inline__)) _mm_cmplt_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)b, (__v8hi)a);
}

/* Signed dword-wise a < b, via b > a. */
static inline __m128i __attribute__((__always_inline__)) _mm_cmplt_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)b, (__v4si)a);
}
782
#ifdef __x86_64__
/* Convert a 64-bit int to a double in the low element of a (CVTSI2SD, 64-bit). */
static inline __m128d __attribute__((__always_inline__)) _mm_cvtsi64_sd(__m128d a, long long b)
{
  return __builtin_ia32_cvtsi642sd(a, b);
}

/* Convert the low double to a rounded 64-bit int (CVTSD2SI, 64-bit). */
static inline long long __attribute__((__always_inline__)) _mm_cvtsd_si64(__m128d a)
{
  return __builtin_ia32_cvtsd2si64(a);
}

/* Convert the low double to a 64-bit int with truncation (CVTTSD2SI, 64-bit). */
static inline long long __attribute__((__always_inline__)) _mm_cvttsd_si64(__m128d a)
{
  return __builtin_ia32_cvttsd2si64(a);
}
#endif
799
/* Convert 4 packed 32-bit ints to floats (CVTDQ2PS). */
static inline __m128 __attribute__((__always_inline__)) _mm_cvtepi32_ps(__m128i a)
{
  return __builtin_ia32_cvtdq2ps((__v4si)a);
}

/* Convert 4 packed floats to rounded 32-bit ints (CVTPS2DQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_cvtps_epi32(__m128 a)
{
  return (__m128i)__builtin_ia32_cvtps2dq(a);
}

/* Convert 4 packed floats to 32-bit ints with truncation (CVTTPS2DQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_cvttps_epi32(__m128 a)
{
  return (__m128i)__builtin_ia32_cvttps2dq(a);
}

/* Place a in the low 32 bits and zero the rest (MOVD semantics). */
static inline __m128i __attribute__((__always_inline__)) _mm_cvtsi32_si128(int a)
{
  return (__m128i)(__v4si){ a, 0, 0, 0 };
}

#ifdef __x86_64__
/* Place a in the low 64 bits and zero the rest (MOVQ semantics). */
static inline __m128i __attribute__((__always_inline__)) _mm_cvtsi64_si128(long long a)
{
  return (__m128i){ a, 0 };
}
#endif

/* Extract the low 32 bits as an int. */
static inline int __attribute__((__always_inline__)) _mm_cvtsi128_si32(__m128i a)
{
  __v4si b = (__v4si)a;
  return b[0];
}

#ifdef __x86_64__
/* Extract the low 64 bits as a long long. */
static inline long long __attribute__((__always_inline__)) _mm_cvtsi128_si64(__m128i a)
{
  return a[0];
}
#endif
839
/* Load a 128-bit value; p must be 16-byte aligned (MOVDQA semantics). */
static inline __m128i __attribute__((__always_inline__)) _mm_load_si128(__m128i const *p)
{
  return *p;
}

/* Load a 128-bit value from an unaligned address (MOVDQU). */
static inline __m128i __attribute__((__always_inline__)) _mm_loadu_si128(__m128i const *p)
{
  return (__m128i)__builtin_ia32_loaddqu((char const *)p);
}

/* Load 64 bits into the low half and zero the high half (MOVQ). */
static inline __m128i __attribute__((__always_inline__)) _mm_loadl_epi64(__m128i const *p)
{
  return (__m128i)__builtin_ia32_loadlv4si((__v2si *)p);
}

/* Build a vector from two 64-bit MMX values: q0 low, q1 high. */
static inline __m128i __attribute__((__always_inline__)) _mm_set_epi64(__m64 q1, __m64 q0)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}
858}
859
/* Set 4 dwords; Intel order: first argument is the HIGHEST element. */
static inline __m128i __attribute__((__always_inline__)) _mm_set_epi32(int i3, int i2, int i1, int i0)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3};
}

/* Set 8 words; first argument is the highest element. */
static inline __m128i __attribute__((__always_inline__)) _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

/* Set 16 bytes; first argument is the highest element. */
static inline __m128i __attribute__((__always_inline__)) _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}

/* Broadcast a 64-bit MMX value into both halves. */
static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi64(__m64 q)
{
  return (__m128i){ (long long)q, (long long)q };
}

/* Broadcast a 32-bit int into all 4 elements. */
static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi32(int i)
{
  return (__m128i)(__v4si){ i, i, i, i };
}

/* Broadcast a 16-bit word into all 8 elements. */
static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi16(short w)
{
  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
}

/* Broadcast a byte into all 16 elements. */
static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi8(char b)
{
  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

/* setr variants: arguments are given from LOWEST element to highest. */
static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi64(__m64 q0, __m64 q1)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi32(int i0, int i1, int i2, int i3)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3};
}

static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}
914
/* Return an all-zero 128-bit integer vector. */
static inline __m128i __attribute__((__always_inline__)) _mm_setzero_si128(void)
{
  return (__m128i){ 0LL, 0LL };
}

/* Store a 128-bit value; p must be 16-byte aligned (MOVDQA semantics). */
static inline void __attribute__((__always_inline__)) _mm_store_si128(__m128i *p, __m128i b)
{
  *p = b;
}

/* Store a 128-bit value to an unaligned address (MOVDQU). */
static inline void __attribute__((__always_inline__)) _mm_storeu_si128(__m128i *p, __m128i b)
{
  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
}

/* Conditionally store bytes of d where the sign bit of n is set (MASKMOVDQU). */
static inline void __attribute__((__always_inline__)) _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

/* Store the low 64 bits of a (MOVQ). */
static inline void __attribute__((__always_inline__)) _mm_storel_epi64(__m128i *p, __m128i a)
{
  __builtin_ia32_storelv4si((__v2si *)p, a);
}

/* Non-temporal (cache-bypassing) store of two doubles (MOVNTPD). */
static inline void __attribute__((__always_inline__)) _mm_stream_pd(double *p, __m128d a)
{
  __builtin_ia32_movntpd(p, a);
}

/* Non-temporal store of a 128-bit integer value (MOVNTDQ). */
static inline void __attribute__((__always_inline__)) _mm_stream_si128(__m128i *p, __m128i a)
{
  __builtin_ia32_movntdq(p, a);
}

/* Non-temporal store of a 32-bit int (MOVNTI). */
static inline void __attribute__((__always_inline__)) _mm_stream_si32(int *p, int a)
{
  __builtin_ia32_movnti(p, a);
}
953}
954
/* Flush the cache line containing p from every cache level (CLFLUSH). */
static inline void __attribute__((__always_inline__)) _mm_clflush(void const *p)
{
  __builtin_ia32_clflush(p);
}

/* Serialize all preceding loads (LFENCE). */
static inline void __attribute__((__always_inline__)) _mm_lfence(void)
{
  __builtin_ia32_lfence();
}

/* Serialize all preceding loads and stores (MFENCE). */
static inline void __attribute__((__always_inline__)) _mm_mfence(void)
{
  __builtin_ia32_mfence();
}
969
/* Pack 16 signed words into signed bytes with saturation (PACKSSWB). */
static inline __m128i __attribute__((__always_inline__)) _mm_packs_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

/* Pack 8 signed dwords into signed words with saturation (PACKSSDW). */
static inline __m128i __attribute__((__always_inline__)) _mm_packs_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

/* Pack 16 signed words into unsigned bytes with saturation (PACKUSWB). */
static inline __m128i __attribute__((__always_inline__)) _mm_packus_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

/* Extract the 16-bit word selected by imm (PEXTRW); imm must be 0..7. */
static inline int __attribute__((__always_inline__)) _mm_extract_epi16(__m128i a, int imm)
{
  __v8hi b = (__v8hi)a;
  return b[imm];
}

/* Insert the low 16 bits of b at the word position selected by imm (PINSRW). */
static inline __m128i __attribute__((__always_inline__)) _mm_insert_epi16(__m128i a, int b, int imm)
{
  return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)a, b, imm);
}

/* Collect the sign bits of all 16 bytes into an int bitmask (PMOVMSKB). */
static inline int __attribute__((__always_inline__)) _mm_movemask_epi8(__m128i a)
{
  return __builtin_ia32_pmovmskb128((__v16qi)a);
}
999}
1000
/* Immediate-operand shuffles (PSHUFD / PSHUFHW / PSHUFLW).  These must
   be macros rather than inline functions because the shuffle selector
   has to be an integer constant expression at the builtin call site. */
#define _mm_shuffle_epi32(a, imm) ((__m128i)__builtin_ia32_pshufd((__v4si)(a), (imm)))
#define _mm_shufflehi_epi16(a, imm) ((__m128i)__builtin_ia32_pshufhw((__v8hi)(a), (imm)))
#define _mm_shufflelo_epi16(a, imm) ((__m128i)__builtin_ia32_pshuflw((__v8hi)(a), (imm)))
1004
1005static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi8(__m128i a, __m128i b)
1006{
1007  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1008}
1009
1010static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi16(__m128i a, __m128i b)
1011{
1012  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1013}
1014
1015static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi32(__m128i a, __m128i b)
1016{
1017  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1018}
1019
1020static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi64(__m128i a, __m128i b)
1021{
1022  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1023}
1024
1025static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi8(__m128i a, __m128i b)
1026{
1027  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1028}
1029
1030static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi16(__m128i a, __m128i b)
1031{
1032  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1033}
1034
1035static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi32(__m128i a, __m128i b)
1036{
1037  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1038}
1039
1040static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi64(__m128i a, __m128i b)
1041{
1042  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1043}
1044
1045static inline __m64 __attribute__((__always_inline__)) _mm_movepi64_pi64(__m128i a)
1046{
1047  return (__m64)a[0];
1048}
1049
1050static inline __m128i __attribute__((__always_inline__)) _mm_movpi64_pi64(__m64 a)
1051{
1052  return (__m128i){ (long long)a, 0 };
1053}
1054
1055static inline __m128i __attribute__((__always_inline__)) _mm_move_epi64(__m128i a)
1056{
1057  return (__m128i){ a[0], 0 };
1058}
1059
1060static inline __m128d __attribute__((__always_inline__)) _mm_unpackhi_pd(__m128d a, __m128d b)
1061{
1062  return __builtin_shufflevector(a, b, 1, 2+1);
1063}
1064
1065static inline __m128d __attribute__((__always_inline__)) _mm_unpacklo_pd(__m128d a, __m128d b)
1066{
1067  return __builtin_shufflevector(a, b, 0, 2+0);
1068}
1069
1070static inline int __attribute__((__always_inline__)) _mm_movemask_pd(__m128d a)
1071{
1072  return __builtin_ia32_movmskpd(a);
1073}
1074
/* Select elements of a and b by the 2-bit immediate i (SHUFPD); a macro
   because the builtin requires a compile-time-constant selector. */
#define _mm_shuffle_pd(a, b, i) (__builtin_ia32_shufpd((a), (b), (i)))
1076
1077static inline __m128 __attribute__((__always_inline__)) _mm_castpd_ps(__m128d in)
1078{
1079  return (__m128)in;
1080}
1081
1082static inline __m128i __attribute__((__always_inline__)) _mm_castpd_si128(__m128d in)
1083{
1084  return (__m128i)in;
1085}
1086
1087static inline __m128d __attribute__((__always_inline__)) _mm_castps_pd(__m128 in)
1088{
1089  return (__m128d)in;
1090}
1091
1092static inline __m128i __attribute__((__always_inline__)) _mm_castps_si128(__m128 in)
1093{
1094  return (__m128i)in;
1095}
1096
1097static inline __m128 __attribute__((__always_inline__)) _mm_castsi128_ps(__m128i in)
1098{
1099  return (__m128)in;
1100}
1101
1102static inline __m128d __attribute__((__always_inline__)) _mm_castsi128_pd(__m128i in)
1103{
1104  return (__m128d)in;
1105}
1106
/* _mm_pause: spin-wait-loop hint (PAUSE).  The asm has no output
   operands and is therefore implicitly volatile per the GCC extended-asm
   rules, so it is not eliminated by the optimizer. */
static inline void __attribute__((__always_inline__)) _mm_pause(void)
{
  asm("pause");
}
1111
/* Build the 2-bit selector immediate for _mm_shuffle_pd: element index
   x supplies the high double, y the low double.  Renamed to
   _MM_SHUFFLE2 (the name Intel's emmintrin.h uses): the previous
   two-argument definition reused the name _MM_SHUFFLE, silently
   redefining the four-argument SSE macro provided by <xmmintrin.h>. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
1113
1114#endif /* __SSE2__ */
1115
1116#endif /* __EMMINTRIN_H */
1117