/* xmmintrin.h revision a2f12ae0e3893cfa703abbe43c74d513abebe1a1 */
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __XMMINTRIN_H
25#define __XMMINTRIN_H
26
27#ifndef __SSE__
28#error "SSE instruction set not enabled"
29#else
30
31#include <mmintrin.h>
32
33typedef float __v4sf __attribute__((__vector_size__(16)));
34typedef float __m128 __attribute__((__vector_size__(16)));
35
36#include <mm_malloc.h>
37
/*
 * Arithmetic.  The "_ss" (scalar single) forms combine only the low
 * elements of a and b, with elements 1-3 of a carried into the result;
 * the "_ps" (packed single) forms operate on all four lanes.
 */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_addss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_subss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_mulss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_divss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

/* Square root, plus the hardware's *approximate* reciprocal (RCPSS/RCPPS)
   and approximate reciprocal square root (RSQRTSS/RSQRTPS); the
   approximations trade accuracy for speed. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

/* Per-lane minimum / maximum. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

/* Bitwise logic on the raw 128 bits.  _mm_andnot_ps is ANDNPS:
   (~a) & b. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_andps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_andnps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_orps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_xorps(a, b);
}
169
/*
 * Comparisons.  Each compared lane of the result is all-ones (true) or
 * all-zeros (false); "_ss" forms compare only the low elements.  SSE has
 * no greater-than / greater-or-equal compare instructions, so the
 * cmpgt/cmpge (and cmpngt/cmpnge) variants below swap the operands of
 * cmplt/cmple (cmpnlt/cmpnle) instead.
 */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpeqss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpeqps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpless(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpleps(a, b);
}

/* a > b is implemented as b < a (operands swapped). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltss(b, a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltps(b, a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpless(b, a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpleps(b, a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpneqss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpneqps(a, b);
}

/* "Not less-than" etc. are distinct from ">=" in the presence of NaNs:
   the negated forms are true when the operands are unordered. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnless(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnleps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltss(b, a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltps(b, a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnless(b, a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnleps(b, a);
}

/* Ordered: true where neither lane is NaN.  Unordered: true where at
   least one lane is NaN. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpordss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpordps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpunordss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpunordps(a, b);
}
313
/*
 * Scalar compares of the low elements, returning 0 or 1.  "comi" forms
 * use COMISS; "ucomi" forms use UCOMISS, which does not raise the
 * invalid-operation exception for quiet NaNs.
 */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}
385
/*
 * Conversions between floats and integers.  _mm_cvt* round according to
 * the current MXCSR rounding mode; _mm_cvtt* ("t" = truncate) always
 * round toward zero.
 *
 * NOTE(review): _mm_cvtss_si64/_mm_cvttss_si64 wrap 64-bit-only
 * instructions yet are not guarded by __x86_64__ like _mm_cvtsi64_ss
 * below — verify this is intentional.
 */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

/* Low two floats -> two 32-bit ints in an MMX register. */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return __builtin_ia32_cvttss2si(a);
}

static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return __builtin_ia32_cvttss2si64(a);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

/* Replace element 0 of a with (float)b; elements 1-3 pass through. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  return __builtin_ia32_cvtsi2ss(a, b);
}

#ifdef __x86_64__

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  return __builtin_ia32_cvtsi642ss(a, b);
}

#endif

/* Replace the low two floats of a with the two 32-bit ints in b. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

/* Extract element 0 as a plain float. */
static inline float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}
449
/*
 * Loads and initializers.  _mm_load_ps dereferences p as a __m128 and so
 * requires 16-byte alignment; _mm_loadu_ps does not.
 */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, __m64 const *p)
{
  return __builtin_ia32_loadhps(a, (__v2si *)p);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, __m64 const *p)
{
  return __builtin_ia32_loadlps(a, (__v2si *)p);
}

/* Load one float into element 0; elements 1-3 are zeroed. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

/* Broadcast *p into all four elements. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(float *p)
{
  return *(__m128*)p;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(float *p)
{
  return __builtin_ia32_loadups(p);
}

/* Aligned load with the four elements reversed. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

// Microsoft specific.
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
    return _mm_set1_ps(w);
}

/* Arguments are given highest element first: w becomes element 0, z
   element 3.  _mm_setr_ps takes them in memory order instead. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}
523
524static inline __m128 __attribute__((__always__inline__))
525_mm_setzero_ps(void)
526{
527  return (__m128){ 0, 0, 0, 0 };
528}
529
530static inline void __attribute__((__always__inline__))
531_mm_storeh_pi(__m64 *p, __m128 a)
532{
533  __builtin_ia32_storehps((__v2si *)p, a);
534}
535
536static inline void __attribute__((__always__inline__))
537_mm_storel_pi(__m64 *p, __m128 a)
538{
539  __builtin_ia32_storelps((__v2si *)p, a);
540}
541
542static inline void __attribute__((__always__inline__))
543_mm_store_ss(float *p, __m128 a)
544{
545  *p = a[0];
546}
547
/* Unaligned store of all four floats. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

/* Broadcast element 0 to all lanes, then store (unaligned). */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

/* Aligned store: p must be 16-byte aligned. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

/* Aligned store with the four elements reversed. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}
573
#define _MM_HINT_T0 1
#define _MM_HINT_T1 2
#define _MM_HINT_T2 3
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/* Fix: parenthesize the macro arguments.  Previously `(void *)a` bound the
   cast only to the first token of a compound argument, so e.g.
   _mm_prefetch(p + i, sel) expanded to ((void *)p + i) instead of
   ((void *)(p + i)). */
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
583
/* Non-temporal (cache-bypassing) stores: MOVNTQ / MOVNTPS. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

/* Store fence: orders preceding stores before subsequent ones. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

/* Extract the n-th (0-3) 16-bit lane of a, zero-extended into an int. */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  /* FIXME:
   * This should force n to be an immediate.
   * This does not use the PEXTRW instruction. From looking at the LLVM source, the
     instruction doesn't seem to be hooked up.
   * The code could probably be made better :)
   */
  __v4hi b = (__v4hi)a;
  /* Clamp n to 0-3 by hand (out-of-range n selects lane 3). */
  return b[(n == 0) ? 0 : (n == 1 ? 1 : (n == 2 ? 2 : 3))];
}
614
615/* FIXME: Implement this. We could add a __builtin_insertelement function that's similar to
616   the already existing __builtin_shufflevector.
617*/
618/*
619static inline __m64 __attribute__((__always_inline__, __nodebug__))
620_mm_insert_pi16(__m64 a, int d, int n)
621{
622   return (__m64){ 0LL };
623}
624*/
625
/* MMX extensions: per-lane max/min of signed 16-bit words and unsigned
   bytes. */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

/* Gather the sign bit of each of the 8 bytes into the low 8 bits of the
   result (PMOVMSKB). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

/* High 16 bits of each unsigned 16x16 -> 32 product (PMULHUW). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}
661
/* Shuffle the four 16-bit lanes of a per the immediate n (PSHUFW).
   Fix: parenthesize the macro arguments — previously `(__v4hi)a` bound the
   cast only to the first token of a compound argument. */
#define _mm_shuffle_pi16(a, n) ((__m64)__builtin_ia32_pshufw((__v4hi)(a), (n)))
663
/* Conditional byte store (MASKMOVQ): bytes of d whose corresponding byte
   in n has its high bit set are written to p. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

/* Rounded per-lane averages of unsigned bytes / words (PAVGB/PAVGW). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}
681
682static inline __m64 __attribute__((__always_inline__, __nodebug___))
683_mm_sad_pu8(__m64 a, __m64 b)
684{
685  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
686}
687
/* Read the MXCSR control/status register (STMXCSR).
   Fix: the attribute was spelled "__nodebug___" (three trailing
   underscores), which is not a recognized attribute name. */
static inline unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}
693
/* Write the MXCSR control/status register (LDMXCSR). */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}
699
/* Select four floats from a/b per the immediate mask (SHUFPS).
   Fix: parenthesize the macro arguments so compound expressions expand
   correctly. */
#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps((a), (b), (mask)))
701
/* Interleave the high halves of a and b: { a2, b2, a3, b3 }. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

/* Interleave the low halves of a and b: { a0, b0, a1, b1 }. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

/* Result is { b0, a1, a2, a3 } (MOVSS register form). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

/* Result is { b2, b3, a2, a3 } (MOVHLPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

/* Result is { a0, a1, b0, b1 } (MOVLHPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}
731
/* Convert four signed 16-bit ints to four floats.  The sign is recovered
   by building a mask of the negative lanes (cmpgt against zero) and
   unpacking it as the high halves, then converting two 32-bit lanes at a
   time with _mm_cvtpi32_ps. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);     /* all-ones in lanes where a is negative */
  c = _mm_unpackhi_pi16(a, b);  /* sign-extend the high two lanes */
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);      /* move converted pair to the high half */
  c = _mm_unpacklo_pi16(a, b);  /* sign-extend the low two lanes */
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

/* Convert four unsigned 16-bit ints to four floats; like _mm_cvtpi16_ps
   but zero-extends (unpacks against zero) instead of sign-extending. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

/* Convert the low four signed 8-bit ints to floats, by sign-extending to
   16 bits and reusing _mm_cvtpi16_ps. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);   /* mask of negative bytes */
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

/* Convert the low four unsigned 8-bit ints to floats (zero-extend). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

/* Convert four 32-bit ints, split across a (low pair) and b (high pair),
   to four floats. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}
801
802static inline __m64 __attribute__((__always_inline__, __nodebug__))
803_mm_cvtps_pi16(__m128 a)
804{
805  __m64 b, c;
806
807  b = _mm_cvtps_pi32(a);
808  a = _mm_movehl_ps(a, a);
809  c = _mm_cvtps_pi32(a);
810
811  return _mm_packs_pi16(b, c);
812}
813
/* Convert the four floats of a to four signed 8-bit ints (in the low half
   of the result) via 16-bit intermediates, with signed saturation. */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

/* Gather the sign bit of each float into the low 4 bits (MOVMSKPS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}
830
/* Build the 8-bit immediate for _mm_shuffle_ps/_mm_shuffle_pi16: z selects
   result element 3 ... w selects element 0. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* MXCSR exception *status* flag bits (sticky flags, low 6 bits). */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)
840
/* MXCSR exception *mask* bits (bits 7-12).
   Fix: four of these were mistakenly named _MM_EXCEPT_DIV_ZERO /
   _MM_EXCEPT_OVERFLOW / _MM_EXCEPT_UNDERFLOW / _MM_EXCEPT_INEXACT,
   silently redefining the status-flag macros above with conflicting
   values and leaving the _MM_MASK_* names undefined. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)
848
849#define _MM_ROUND_NEAREST     (0x0000)
850#define _MM_ROUND_DOWN        (0x2000)
851#define _MM_ROUND_UP          (0x4000)
852#define _MM_ROUND_TOWARD_ZERO (0x6000)
853#define _MM_ROUND_MASK        (0x6000)
854
/* MXCSR flush-to-zero control (bit 15).
   Fix: _MM_FLUSH_ZERO_OFF was defined as 0x8000 — identical to
   _MM_FLUSH_ZERO_ON — making _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF)
   turn FTZ *on*.  OFF must clear the bit, i.e. 0x0000. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
858
859#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
860#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
861#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
862#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
863
864#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
865#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
866#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
867#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
868
/* In-place 4x4 transpose of the matrix whose rows are row0..row3.
   Fix: row3 used _mm_movelh_ps, which yields { tmp3.lo, tmp1.lo } — i.e.
   it duplicated row2's source data instead of taking the high halves.
   The last row must combine the *high* halves: _mm_movehl_ps(tmp3, tmp1)
   = { tmp1.hi, tmp3.hi }, matching the movehl used for row1. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
881
882#include <emmintrin.h>
883
884#endif /* __SSE__ */
885
886#endif /* __XMMINTRIN_H */
887