/* Copyright (C) 2002-2013 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#ifndef __SSE__
# error "SSE instruction set not enabled"
#else

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
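
/* For example, _MM_SHUFFLE (3, 2, 1, 0) evaluates to 0xE4, the identity
   selector: with SHUFPS, fp0 and fp1 choose the two low result elements
   from the first operand and fp2 and fp3 choose the two high result
   elements from the second operand.  With placeholder vectors __a and __b:

     __m128 __r = _mm_shuffle_ps (__a, __b, _MM_SHUFFLE (3, 2, 1, 0));

   leaves __r holding { __a[0], __a[1], __b[2], __b[3] }.  */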

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}
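
/* Illustrative sketch (the vectors are placeholders, not part of the API):
   the _ss forms touch only element 0, so with
     __m128 __a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
     __m128 __b = _mm_set_ps (40.0f, 30.0f, 20.0f, 10.0f);
     __m128 __r = _mm_add_ss (__a, __b);
   __r holds { 11.0f, 2.0f, 3.0f, 4.0f } in element order 0..3.  */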

/* Perform the respective operation on the four SPFP values in A and B.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

/* Perform logical bit-wise operations on 128-bit values.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpltss ((__v4sf) __B,
								(__v4sf)
								__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpless ((__v4sf) __B,
								(__v4sf)
								__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpnltss ((__v4sf) __B,
								 (__v4sf)
								 __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpnless ((__v4sf) __B,
								 (__v4sf)
								 __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}
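
/* One common use of the comparison masks, sketched here with placeholder
   vectors __a and __b: combine them with the bit-wise operations above to
   select elements without branching.
     __m128 __mask = _mm_cmplt_ps (__a, __b);
     __m128 __min  = _mm_or_ps (_mm_and_ps (__mask, __a),
                                _mm_andnot_ps (__mask, __b));
   Each element of __min is then the smaller of the corresponding elements
   of __a and __b (for this particular job _mm_min_ps would of course do
   directly).  */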

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 32-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 32-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}
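
/* A brief sketch of the difference between the two conversion families,
   using arbitrary example values: under the default round-to-nearest mode,
   _mm_cvtss_si32 (_mm_set_ss (2.7f)) yields 3, while
   _mm_cvttss_si32 (_mm_set_ss (2.7f)) truncates toward zero and yields 2;
   for 2.5f the results are 2 (ties round to even) and 2 respectively.  */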

/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */

/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK)					\
  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A),			\
				   (__v4sf)(__m128)(B), (int)(MASK)))
#endif
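
/* A small usage sketch with placeholder vectors __a and __b:
     __m128 __even = _mm_shuffle_ps (__a, __b, _MM_SHUFFLE (2, 0, 2, 0));
   gathers { __a[0], __a[2], __b[0], __b[2] }, i.e. the even-indexed
   elements of each input; passing __a for both operands with
   _MM_SHUFFLE (0, 0, 0, 0) broadcasts element 0 across the result.  */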

/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}
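
/* A typical use, sketched with placeholder vectors __a and __b: collapse a
   packed comparison into a single scalar test.
     int __any_lt = _mm_movemask_ps (_mm_cmplt_ps (__a, __b)) != 0;
   __any_lt is nonzero when at least one element of __a is less than the
   corresponding element of __b.  */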

/* Return the contents of the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
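
/* A usage sketch for the helpers above: switch to truncating rounding for
   a region of code and restore the previous mode afterwards.
     unsigned int __saved = _MM_GET_ROUNDING_MODE ();
     _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
     ...code whose SSE conversions and arithmetic should truncate...
     _MM_SET_ROUNDING_MODE (__saved);
   _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON) similarly makes denormal
   results flush to zero.  */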

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
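
/* A note on ordering, with arbitrary example values: _mm_set_ps lists the
   elements from index 3 down to index 0, while _mm_setr_ps lists them from
   index 0 up, matching memory order.
     _mm_set_ps  (3.0f, 2.0f, 1.0f, 0.0f)
     _mm_setr_ps (0.0f, 1.0f, 2.0f, 3.0f)
   both produce the vector whose element 0 is 0.0f and element 3 is 3.0f,
   the same layout _mm_load_ps would read from the array
   { 0.0f, 1.0f, 2.0f, 3.0f }.  */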

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N)	\
  ((int) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)				\
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A),	\
					(int)(D), (int)(N)))

#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif
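
/* A usage sketch, with __p standing for any pointer into data that will be
   read soon: _mm_prefetch (__p, _MM_HINT_T0) asks for the containing cache
   line to be fetched into all cache levels, while _MM_HINT_NTA requests a
   non-temporal fetch that minimizes cache pollution.  The hint affects
   performance only, not the architectural state.  */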
1198
1199/* Stores the data in A to the address P without polluting the caches.  */
1200extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1201_mm_stream_pi (__m64 *__P, __m64 __A)
1202{
1203  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
1204}
1205
1206/* Likewise.  The address must be 16-byte aligned.  */
1207extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1208_mm_stream_ps (float *__P, __m128 __A)
1209{
1210  __builtin_ia32_movntps (__P, (__v4sf)__A);
1211}
1212
1213/* Guarantees that every preceding store is globally visible before
1214   any subsequent store.  */
1215extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1216_mm_sfence (void)
1217{
1218  __builtin_ia32_sfence ();
1219}
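
/* The streaming stores above are weakly ordered, so a typical pattern,
   sketched with placeholder arrays __dst and __src whose length __n is a
   multiple of four floats, ends with an sfence:
     for (__i = 0; __i < __n; __i += 4)
       _mm_stream_ps (__dst + __i, _mm_load_ps (__src + __i));
     _mm_sfence ();
   The fence makes the non-temporal stores globally visible before any
   later store.  */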

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  __builtin_ia32_pause ();
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);			\
  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3);			\
  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1);			\
  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);			\
  (row0) = __builtin_ia32_movlhps (__t0, __t1);				\
  (row1) = __builtin_ia32_movhlps (__t1, __t0);				\
  (row2) = __builtin_ia32_movlhps (__t2, __t3);				\
  (row3) = __builtin_ia32_movhlps (__t3, __t2);				\
} while (0)
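
/* A usage sketch with four placeholder row variables holding a row-major
   4x4 matrix:
     _MM_TRANSPOSE4_PS (__r0, __r1, __r2, __r3);
   Afterwards __r0 holds the first column of the original matrix, __r1 the
   second, and so on; the macro expands to the unpack/move sequence above
   and overwrites its arguments in place.  */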

/* For backward source compatibility.  */
#ifdef __SSE2__
# include <emmintrin.h>
#endif

#endif /* __SSE__ */
#endif /* _XMMINTRIN_H_INCLUDED */