xmmintrin.h revision 17d2e3a7d15dc809a25896973d4aa2205e63c122
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __XMMINTRIN_H
25#define __XMMINTRIN_H
26
27#ifndef __SSE__
28#error "SSE instruction set not enabled"
29#else
30
31#include <mmintrin.h>
32
33typedef float __v4sf __attribute__((__vector_size__(16)));
34typedef float __m128 __attribute__((__vector_size__(16)));
35
36#include <mm_malloc.h>
37
/* Scalar add: a[0] + b[0] in lane 0; lanes 1-3 pass through from a (ADDSS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

/* Packed add of all four float lanes (ADDPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

/* Scalar subtract: a[0] - b[0] in lane 0; lanes 1-3 pass through from a (SUBSS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

/* Packed subtract of all four float lanes (SUBPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

/* Scalar multiply: a[0] * b[0] in lane 0; lanes 1-3 pass through from a (MULSS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

/* Packed multiply of all four float lanes (MULPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

/* Scalar divide: a[0] / b[0] in lane 0; lanes 1-3 pass through from a (DIVSS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

/* Packed divide of all four float lanes (DIVPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}
89
/* Square root of lane 0; lanes 1-3 pass through (SQRTSS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

/* Square root of all four lanes (SQRTPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

/* Approximate reciprocal of lane 0; lanes 1-3 pass through (RCPSS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

/* Approximate reciprocal of all four lanes (RCPPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

/* Approximate reciprocal square root of lane 0; lanes 1-3 pass through (RSQRTSS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

/* Approximate reciprocal square root of all four lanes (RSQRTPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}
125
/* Minimum of the low lanes; lanes 1-3 pass through from a (MINSS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

/* Lane-wise minimum of all four lanes (MINPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

/* Maximum of the low lanes; lanes 1-3 pass through from a (MAXSS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

/* Lane-wise maximum of all four lanes (MAXPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}
149
/* Bitwise AND of the 128-bit values (ANDPS); done through an integer
   vector type since C does not define & on float vectors. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)((__v4si)a & (__v4si)b);
}

/* Bitwise AND-NOT: (~a) & b (ANDNPS).  Note the first operand is the one
   complemented, matching the instruction's semantics. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)(~(__v4si)a & (__v4si)b);
}

/* Bitwise OR of the 128-bit values (ORPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)((__v4si)a | (__v4si)b);
}
170
171static inline __m128 __attribute__((__always_inline__, __nodebug__))
172_mm_xor_ps(__m128 a, __m128 b)
173{
174  typedef int __v4si __attribute__((__vector_size__(16)));
175  return (__m128)((__v4si)a ^ ~(__v4si)b);
176}
177
/* Scalar equality compare: lane 0 becomes an all-ones/all-zeros mask;
   lanes 1-3 pass through from a (CMPSS imm=0, EQ). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

/* Packed equality compare: each lane becomes a mask (CMPPS imm=0, EQ). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

/* Scalar a < b mask in lane 0; lanes 1-3 from a (CMPSS imm=1, LT). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

/* Packed a < b masks (CMPPS imm=1, LT). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

/* Scalar a <= b mask in lane 0; lanes 1-3 from a (CMPSS imm=2, LE). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

/* Packed a <= b masks (CMPPS imm=2, LE). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}
213
214static inline __m128 __attribute__((__always_inline__, __nodebug__))
215_mm_cmpgt_ss(__m128 a, __m128 b)
216{
217  return (__m128)__builtin_ia32_cmpss(b, a, 1);
218}
219
220static inline __m128 __attribute__((__always_inline__, __nodebug__))
221_mm_cmpgt_ps(__m128 a, __m128 b)
222{
223  return (__m128)__builtin_ia32_cmpps(b, a, 1);
224}
225
226static inline __m128 __attribute__((__always_inline__, __nodebug__))
227_mm_cmpge_ss(__m128 a, __m128 b)
228{
229  return (__m128)__builtin_ia32_cmpss(b, a, 2);
230}
231
232static inline __m128 __attribute__((__always_inline__, __nodebug__))
233_mm_cmpge_ps(__m128 a, __m128 b)
234{
235  return (__m128)__builtin_ia32_cmpps(b, a, 2);
236}
237
/* Scalar not-equal mask in lane 0; lanes 1-3 from a (CMPSS imm=4, NEQ). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

/* Packed not-equal masks (CMPPS imm=4, NEQ). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

/* Scalar not-less-than mask (true also when unordered); imm=5, NLT. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

/* Packed not-less-than masks (CMPPS imm=5, NLT). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

/* Scalar not-less-or-equal mask (true also when unordered); imm=6, NLE. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

/* Packed not-less-or-equal masks (CMPPS imm=6, NLE). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}
273
274static inline __m128 __attribute__((__always_inline__, __nodebug__))
275_mm_cmpngt_ss(__m128 a, __m128 b)
276{
277  return (__m128)__builtin_ia32_cmpss(b, a, 5);
278}
279
280static inline __m128 __attribute__((__always_inline__, __nodebug__))
281_mm_cmpngt_ps(__m128 a, __m128 b)
282{
283  return (__m128)__builtin_ia32_cmpps(b, a, 5);
284}
285
286static inline __m128 __attribute__((__always_inline__, __nodebug__))
287_mm_cmpnge_ss(__m128 a, __m128 b)
288{
289  return (__m128)__builtin_ia32_cmpss(b, a, 6);
290}
291
292static inline __m128 __attribute__((__always_inline__, __nodebug__))
293_mm_cmpnge_ps(__m128 a, __m128 b)
294{
295  return (__m128)__builtin_ia32_cmpps(b, a, 6);
296}
297
/* Scalar "ordered" mask: true when neither a[0] nor b[0] is NaN
   (CMPSS imm=7, ORD); lanes 1-3 from a. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

/* Packed "ordered" masks (CMPPS imm=7, ORD). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

/* Scalar "unordered" mask: true when either operand is NaN
   (CMPSS imm=3, UNORD); lanes 1-3 from a. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

/* Packed "unordered" masks (CMPPS imm=3, UNORD). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}
321
/* Scalar compares returning 0/1 ints.  The _mm_comi* family uses COMISS
   and the _mm_ucomi* family uses UCOMISS; both compare only lane 0. */

/* 1 if a[0] == b[0] (COMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

/* 1 if a[0] < b[0] (COMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

/* 1 if a[0] <= b[0] (COMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

/* 1 if a[0] > b[0] (COMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

/* 1 if a[0] >= b[0] (COMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

/* 1 if a[0] != b[0] (COMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

/* 1 if a[0] == b[0] (UCOMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

/* 1 if a[0] < b[0] (UCOMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

/* 1 if a[0] <= b[0] (UCOMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

/* 1 if a[0] > b[0] (UCOMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

/* 1 if a[0] >= b[0] (UCOMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

/* 1 if a[0] != b[0] (UCOMISS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}
393
/* Convert a[0] to a 32-bit int using the current MXCSR rounding mode
   (CVTSS2SI). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

#ifdef __x86_64__

/* Convert a[0] to a 64-bit int using the current rounding mode
   (CVTSS2SI, 64-bit form; only available on x86-64). */
static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif
409
/* Convert the two low float lanes to two packed 32-bit ints in an MMX
   register, using the current rounding mode (CVTPS2PI). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

/* Truncating convert of a[0] to int.  Implemented as a C float->int
   conversion, which truncates toward zero like CVTTSS2SI.
   NOTE(review): out-of-range values are UB in C, whereas the instruction
   produces the integer-indefinite value — confirm this is acceptable. */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

/* Truncating convert of a[0] to a 64-bit int (cf. CVTTSS2SI 64-bit form).
   NOTE(review): unlike _mm_cvtss_si64 this is not guarded by
   __x86_64__ — confirm whether that is intentional. */
static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

/* Truncating convert of the two low float lanes to packed 32-bit ints
   (CVTTPS2PI). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}
433
/* Convert int b to float in lane 0; lanes 1-3 pass through from a
   (CVTSI2SS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

#ifdef __x86_64__

/* 64-bit variant of _mm_cvtsi32_ss (CVTSI2SS with a 64-bit source). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

/* Convert the two packed ints in b to floats in lanes 0-1; lanes 2-3
   pass through from a (CVTPI2PS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

/* Extract lane 0 as a plain float. */
static inline float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}
463
/* Load 64 bits from p into the high two lanes; low two lanes pass
   through from a (MOVHPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, __m64 const *p)
{
  return __builtin_ia32_loadhps(a, (__v2si *)p);
}

/* Load 64 bits from p into the low two lanes; high two lanes pass
   through from a (MOVLPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, __m64 const *p)
{
#if 0
  // FIXME: This should work, but gives really crappy code at the moment
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
#endif
  return __builtin_ia32_loadlps(a, (__v2si *)p);
}
482
483static inline __m128 __attribute__((__always_inline__, __nodebug__))
484_mm_load_ss(float *p)
485{
486  return (__m128){ *p, 0, 0, 0 };
487}
488
489static inline __m128 __attribute__((__always_inline__, __nodebug__))
490_mm_load1_ps(float *p)
491{
492  return (__m128){ *p, *p, *p, *p };
493}
494
495#define        _mm_load_ps1(p) _mm_load1_ps(p)
496
497static inline __m128 __attribute__((__always_inline__, __nodebug__))
498_mm_load_ps(float *p)
499{
500  return *(__m128*)p;
501}
502
503static inline __m128 __attribute__((__always_inline__, __nodebug__))
504_mm_loadu_ps(float *p)
505{
506  return __builtin_ia32_loadups(p);
507}
508
509static inline __m128 __attribute__((__always_inline__, __nodebug__))
510_mm_loadr_ps(float *p)
511{
512  __m128 a = _mm_load_ps(p);
513  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
514}
515
/* Lane 0 = w; lanes 1-3 zeroed. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

/* Broadcast w into all four lanes. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

// Microsoft specific alias for _mm_set1_ps.
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
    return _mm_set1_ps(w);
}

/* Arguments are given high lane first: z lands in lane 3, w in lane 0. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

/* "Reversed" set: arguments land in lanes 0..3 in the order given. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}
546
547static inline __m128 __attribute__((__always_inline__))
548_mm_setzero_ps(void)
549{
550  return (__m128){ 0, 0, 0, 0 };
551}
552
553static inline void __attribute__((__always_inline__))
554_mm_storeh_pi(__m64 *p, __m128 a)
555{
556  __builtin_ia32_storehps((__v2si *)p, a);
557}
558
559static inline void __attribute__((__always_inline__))
560_mm_storel_pi(__m64 *p, __m128 a)
561{
562  __builtin_ia32_storelps((__v2si *)p, a);
563}
564
565static inline void __attribute__((__always_inline__))
566_mm_store_ss(float *p, __m128 a)
567{
568  *p = a[0];
569}
570
/* Unaligned 16-byte store (MOVUPS). */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

/* Broadcast lane 0 into all lanes, then store 16 bytes to p.
   Note this uses the unaligned store, so p need not be aligned. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

/* Aligned 16-byte store (MOVAPS); p must be 16-byte aligned. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

/* Aligned store with the lanes written in reversed order. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}
596
/* Prefetch hint selectors.
   NOTE(review): these values are passed straight through as the
   __builtin_prefetch locality argument — confirm the T0/T1/T2 mapping
   matches the intended cache-level hints on the targets in use. */
#define _MM_HINT_T0 1
#define _MM_HINT_T1 2
#define _MM_HINT_T2 3
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/* Prefetch the cache line containing a for reading, with locality sel.
   Macro-hygiene fix: a and sel are parenthesized so expression arguments
   (e.g. p + i) bind correctly inside the expansion. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
606
/* Non-temporal 64-bit store of a to p, bypassing the cache (MOVNTQ). */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

/* Non-temporal 16-byte store of a to p, bypassing the cache (MOVNTPS);
   p must be 16-byte aligned. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

/* Store fence: orders all prior stores before any later ones (SFENCE). */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}
624
/* Extract 16-bit lane n of a, zero-extended to int (PEXTRW).
   The index is masked to 0-3. */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

/* Insert the low 16 bits of d into lane n of a (PINSRW).
   The index is masked to 0-3. */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
   __v4hi b = (__v4hi)a;
   b[n & 3] = d;
   return (__m64)b;
}
639
/* Lane-wise signed 16-bit maximum (PMAXSW). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

/* Lane-wise unsigned 8-bit maximum (PMAXUB). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

/* Lane-wise signed 16-bit minimum (PMINSW). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

/* Lane-wise unsigned 8-bit minimum (PMINUB). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

/* Gather the sign bits of the eight bytes into the low 8 bits of an int
   (PMOVMSKB). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

/* Lane-wise unsigned 16-bit multiply, keeping the high 16 bits of each
   product (PMULHUW). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}
675
676#define _mm_shuffle_pi16(a, n) ((__m64)__builtin_ia32_pshufw((__v4hi)a, n))
677
/* Byte-masked store: bytes of d whose mask byte in n has the high bit set
   are written to p (MASKMOVQ). */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

/* Lane-wise unsigned 8-bit rounded average (PAVGB). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

/* Lane-wise unsigned 16-bit rounded average (PAVGW). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

/* Sum of absolute byte differences, result in the low 16 bits (PSADBW). */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}
701
/* Read the MXCSR control/status register (STMXCSR). */
static inline unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

/* Write the MXCSR control/status register (LDMXCSR). */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}
713
714#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps(a, b, mask))
715
/* Interleave the high halves: {a2, b2, a3, b3} (UNPCKHPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

/* Interleave the low halves: {a0, b0, a1, b1} (UNPCKLPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

/* Replace lane 0 of a with lane 0 of b: {b0, a1, a2, a3} (MOVSS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

/* High half of b into the low half: {b2, b3, a2, a3} (MOVHLPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

/* Low half of b into the high half: {a0, a1, b0, b1} (MOVLHPS). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}
745
/* Convert four signed 16-bit ints to four floats.  b is a sign mask
   (all-ones lanes where a is negative) so the unpacks produce proper
   32-bit sign-extensions, which are then converted two at a time. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);        /* all-ones where a < 0 */
  c = _mm_unpackhi_pi16(a, b);     /* sign-extend high two lanes */
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);         /* move converted pair to high half */
  c = _mm_unpacklo_pi16(a, b);     /* sign-extend low two lanes */
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

/* Convert four unsigned 16-bit ints to four floats.  Same structure as
   _mm_cvtpi16_ps but zero-extends (unpacks against zero). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);     /* zero-extend high two lanes */
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);     /* zero-extend low two lanes */
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

/* Convert the four low signed 8-bit ints to four floats:
   sign-extend to 16 bits, then reuse _mm_cvtpi16_ps. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);         /* all-ones where a < 0 */
  b = _mm_unpacklo_pi8(a, b);      /* sign-extend low four bytes */

  return _mm_cvtpi16_ps(b);
}

/* Convert the four low unsigned 8-bit ints to four floats:
   zero-extend to 16 bits, then reuse _mm_cvtpi16_ps (safe since the
   zero-extended values fit in signed 16 bits). */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);      /* zero-extend low four bytes */

  return _mm_cvtpi16_ps(b);
}
803
/* Convert two pairs of packed 32-bit ints to four floats: b supplies the
   high two lanes, a the low two. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);        /* b -> low half */
  c = _mm_movelh_ps(c, c);         /* shift it to the high half */

  return _mm_cvtpi32_ps(c, a);     /* a -> low half */
}

/* Convert four floats to four signed 16-bit ints with saturation:
   convert each half to 32-bit ints, then pack. */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);           /* low two lanes */
  a = _mm_movehl_ps(a, a);         /* bring high lanes down */
  c = _mm_cvtps_pi32(a);           /* high two lanes */

  return _mm_packs_pi16(b, c);
}

/* Convert four floats to four signed 8-bit ints (in the low 32 bits of
   the result) with saturation; the high 32 bits are zero. */
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}
838
/* Gather the sign bits of the four lanes into the low 4 bits of an int
   (MOVMSKPS). */
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

/* Build a SHUFPS/PSHUFW immediate from four 2-bit lane selectors,
   highest lane first. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
846
/* MXCSR exception status flag bits. */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* MXCSR exception mask (suppression) bits. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* MXCSR rounding-control field values. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)
868
/* MXCSR flush-to-zero control.
   Bug fix: _MM_FLUSH_ZERO_OFF was defined as 0x8000, identical to
   _MM_FLUSH_ZERO_ON, so _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF)
   would turn flush-to-zero ON.  OFF clears the FZ bit, so it must be 0. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
872
/* Read the corresponding field out of MXCSR. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Replace the corresponding field of MXCSR with x (read-modify-write). */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
882
/* In-place 4x4 transpose of the matrix whose rows are row0..row3.
   Bug fix: row3 used _mm_movelh_ps(tmp3, tmp1), which yields
   {tmp3[0], tmp3[1], tmp1[0], tmp1[1]} — the wrong lanes.  The last row
   of the transpose is the high halves, i.e. _mm_movehl_ps(tmp3, tmp1),
   mirroring the row1 step. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
895
896#include <emmintrin.h>
897
898#endif /* __SSE__ */
899
900#endif /* __XMMINTRIN_H */
901