/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}
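
/* Note on the scalar (_ss) forms throughout this header: only the lowest
 * lane is operated on; the upper three lanes of the first operand pass
 * through unchanged.  A minimal illustrative sketch:
 *
 *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes {1, 2, 3, 4}
 *   __m128 b = _mm_set1_ps(10.0f);
 *   __m128 r = _mm_add_ss(a, b);                    // lanes {11, 2, 3, 4}
 */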

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}
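
/* _mm_rcp_* and _mm_rsqrt_* above are fast approximations (maximum relative
 * error on the order of 2^-12), not exact results.  A common refinement is
 * one Newton-Raphson step; an illustrative sketch for the reciprocal:
 *
 *   __m128 x = _mm_rcp_ps(a);
 *   // x' = x * (2 - a*x), roughly doubling the number of correct bits
 *   x = _mm_sub_ps(_mm_add_ps(x, x), _mm_mul_ps(a, _mm_mul_ps(x, x)));
 */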

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a | (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a ^ (__v4si)b);
}
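
/* The bitwise operations above are commonly combined with comparison masks
 * to select between two vectors without branching.  An illustrative sketch
 * using a hypothetical helper (mask lanes must be all-ones or all-zeros, as
 * produced by the _mm_cmp* functions below):
 *
 *   static __inline__ __m128
 *   __select_ps(__m128 mask, __m128 a, __m128 b)
 *   {
 *     // (mask & a) | (~mask & b)
 *     return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
 *   }
 */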

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}
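
/* The packed comparisons above return a mask in each lane (all ones where
 * the predicate holds, all zeros otherwise) rather than a boolean.  An
 * illustrative sketch: clamping negative lanes to zero without a branch.
 *
 *   __m128 nonneg = _mm_and_ps(v, _mm_cmpge_ps(v, _mm_setzero_ps()));
 */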

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}
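
/* The comi* and ucomi* forms above compare only the lowest lane and return
 * an int, so they are suitable for ordinary branching.  The difference is
 * exception behavior: comi* raises the invalid-operation exception on any
 * NaN operand, while ucomi* ("unordered") raises it only for signaling
 * NaNs.  For example:
 *
 *   if (_mm_comilt_ss(a, b)) {
 *     // a[0] < b[0]
 *   }
 */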

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 a)
{
  return _mm_cvtps_pi32(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}
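
/* The cvtt* ("truncate") forms convert with truncation toward zero, while
 * the cvt* forms round according to the current MXCSR rounding mode
 * (round-to-nearest-even by default).  For example:
 *
 *   int r = _mm_cvtss_si32(_mm_set_ss(2.7f));   // 3 under the default mode
 *   int t = _mm_cvttss_si32(_mm_set_ss(2.7f));  // always 2
 */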

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 a)
{
  return _mm_cvttps_pi32(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 a, int b)
{
  return _mm_cvtsi32_ss(a, b);
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 a, __m64 b)
{
  return _mm_cvtpi32_ps(a, b);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 b = ((struct __mm_loadh_pi_struct*)p)->u;
  __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
  return __builtin_shufflevector(a, bb, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 b = ((struct __mm_loadl_pi_struct*)p)->u;
  __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
  return __builtin_shufflevector(a, bb, 4, 5, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
  struct __mm_load_ss_struct {
    float u;
  } __attribute__((__packed__, __may_alias__));
  float u = ((struct __mm_load_ss_struct*)p)->u;
  return (__m128){ u, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
  struct __mm_load1_ps_struct {
    float u;
  } __attribute__((__packed__, __may_alias__));
  float u = ((struct __mm_load1_ps_struct*)p)->u;
  return (__m128){ u, u, u, u };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p)
{
  return *(__m128*)p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
  struct __loadu_ps {
    __m128 v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_ps*)p)->v;
}
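
/* The __packed__/__may_alias__ wrapper structs used by the load functions
 * above tell the compiler that the pointed-to data may be unaligned and may
 * alias other types, so the access is emitted as a safe unaligned load
 * rather than an aligned one.  _mm_load_ps, by contrast, requires p to be
 * 16-byte aligned. */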

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

/* Microsoft specific. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
  return _mm_set1_ps(w);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}
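
/* Note the argument order: _mm_set_ps lists lanes from highest to lowest,
 * _mm_setr_ps ("reversed") from lowest to highest, so these build the same
 * vector:
 *
 *   __m128 v1 = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
 *   __m128 v2 = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
 */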

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storehps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storelps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ss(float *p, __m128 a)
{
  struct __mm_store_ss_struct {
    float u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)p)->u = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *p, __m128 a)
{
  _mm_store1_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}

#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
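
/* An illustrative use of _mm_prefetch: fetching data a fixed distance ahead
 * while streaming through an array.  The distance of 16 floats and the
 * names src, dst, scale are arbitrary examples, to be tuned per workload.
 *
 *   for (i = 0; i + 4 <= n; i += 4) {
 *     _mm_prefetch((const char *)&src[i + 16], _MM_HINT_T0);
 *     _mm_store_ps(&dst[i], _mm_mul_ps(_mm_load_ps(&src[i]), scale));
 *   }
 */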

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}
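
/* The stream (non-temporal) stores above bypass the cache and are weakly
 * ordered; call _mm_sfence() below to make them globally visible before any
 * subsequent store is observed. */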

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
  __v4hi b = (__v4hi)a;
  b[n & 3] = d;
  return (__m64)b;
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

#define _mm_shuffle_pi16(a, n) __extension__ ({ \
  __m64 __a = (a); \
  (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                  (((mask) & 0x30) >> 4) + 4, \
                                  (((mask) & 0xc0) >> 6) + 4); })
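
/* An illustrative use of _mm_shuffle_ps with the _MM_SHUFFLE macro defined
 * below: the two low result lanes come from the first operand, the two high
 * lanes from the second.
 *
 *   __m128 splat0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)); // v[0] in all lanes
 *   __m128 rev    = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3)); // lanes reversed
 */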

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

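/* The integer-to-float helpers below widen packed 16-bit and 8-bit values
 * without a dedicated sign-extension instruction: comparing zero against
 * the input (_mm_cmpgt_pi16/_mm_cmpgt_pi8) yields all-ones exactly in the
 * negative lanes, and unpacking the input with that mask produces the
 * sign-extended (or, with a zero mask, zero-extended) wider elements. */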
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);
  a = _mm_movehl_ps(a, a);
  c = _mm_cvtps_pi32(a);

  /* b and c hold 32-bit integers; pack 32 -> 16 with saturation. */
  return _mm_packs_pi32(b, c);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}
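
/* _mm_movemask_ps gathers the sign bit of each lane into the low four bits
 * of the result, which is handy for branching on vector-wide conditions:
 *
 *   // true if any lane of v has its sign bit set
 *   if (_mm_movemask_ps(v) != 0) { ... }
 */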

#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
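
/* An illustrative sketch of the control-register helpers above: switch the
 * rounding mode temporarily and restore the previous state afterwards.
 *
 *   unsigned int saved = _MM_GET_ROUNDING_MODE();
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   // ... conversions here truncate toward zero ...
 *   _MM_SET_ROUNDING_MODE(saved);
 */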

#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
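
/* An illustrative use of _MM_TRANSPOSE4_PS: transposing a 4x4 matrix m
 * (here assumed to be a float[16], stored row-major) held as four vectors.
 * The macro rewrites its arguments in place.
 *
 *   __m128 r0 = _mm_loadu_ps(&m[0]);
 *   __m128 r1 = _mm_loadu_ps(&m[4]);
 *   __m128 r2 = _mm_loadu_ps(&m[8]);
 *   __m128 r3 = _mm_loadu_ps(&m[12]);
 *   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // r0..r3 now hold the columns
 */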

/* Aliases for compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */
