1/* Copyright (C) 2003-2014 Free Software Foundation, Inc.
2
3   This file is part of GCC.
4
5   GCC is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 3, or (at your option)
8   any later version.
9
10   GCC is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   Under Section 7 of GPL version 3, you are granted additional
16   permissions described in the GCC Runtime Library Exception, version
17   3.1, as published by the Free Software Foundation.
18
19   You should have received a copy of the GNU General Public License and
20   a copy of the GCC Runtime Library Exception along with this program;
21   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22   <http://www.gnu.org/licenses/>.  */
23
24/* Implemented from the specification included in the Intel C++ Compiler
25   User Guide and Reference, version 9.0.  */
26
27#ifndef _EMMINTRIN_H_INCLUDED
28#define _EMMINTRIN_H_INCLUDED
29
/* We need definitions from the SSE header files.  */
31#include <xmmintrin.h>
32
/* When SSE2 is not already enabled, save the current options and turn
   SSE2 on for this header; __DISABLE_SSE2__ records that we did so.  */
#if !defined (__SSE2__)
#define __DISABLE_SSE2__
#pragma GCC push_options
#pragma GCC target("sse2")
#endif /* !__SSE2__ */
38
/* SSE2 internal vector types.  Each one is 16 bytes wide; they differ
   only in element type (listed from narrowest to widest element).  */
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef double __v2df __attribute__ ((__vector_size__ (16)));
45
/* Public 128-bit vector types.  The Intel API lets callers view these
   through other vector types and through their scalar components, so
   both are marked __may_alias__.  */
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
50
/* Build the selector used with SHUFPD: FP0 becomes bit 0 and FP1
   becomes bit 1.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 ((fp0) | ((fp1) << 1))
54
55/* Create a vector with element 0 as F and the rest zero.  */
56extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
57_mm_set_sd (double __F)
58{
59  return __extension__ (__m128d){ __F, 0.0 };
60}
61
62/* Create a vector with both elements equal to F.  */
63extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
64_mm_set1_pd (double __F)
65{
66  return __extension__ (__m128d){ __F, __F };
67}
68
69extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70_mm_set_pd1 (double __F)
71{
72  return _mm_set1_pd (__F);
73}
74
75/* Create a vector with the lower value X and upper value W.  */
76extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
77_mm_set_pd (double __W, double __X)
78{
79  return __extension__ (__m128d){ __X, __W };
80}
81
82/* Create a vector with the lower value W and upper value X.  */
83extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
84_mm_setr_pd (double __W, double __X)
85{
86  return __extension__ (__m128d){ __W, __X };
87}
88
89/* Create an undefined vector.  */
90extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91_mm_undefined_pd (void)
92{
93  __m128d __Y = __Y;
94  return __Y;
95}
96
97/* Create a vector of zeros.  */
98extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
99_mm_setzero_pd (void)
100{
101  return __extension__ (__m128d){ 0.0, 0.0 };
102}
103
104/* Sets the low DPFP value of A from the low value of B.  */
105extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
106_mm_move_sd (__m128d __A, __m128d __B)
107{
108  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
109}
110
111/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
112extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113_mm_load_pd (double const *__P)
114{
115  return *(__m128d *)__P;
116}
117
118/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
119extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120_mm_loadu_pd (double const *__P)
121{
122  return __builtin_ia32_loadupd (__P);
123}
124
125/* Create a vector with all two elements equal to *P.  */
126extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
127_mm_load1_pd (double const *__P)
128{
129  return _mm_set1_pd (*__P);
130}
131
132/* Create a vector with element 0 as *P and the rest zero.  */
133extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134_mm_load_sd (double const *__P)
135{
136  return _mm_set_sd (*__P);
137}
138
139extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140_mm_load_pd1 (double const *__P)
141{
142  return _mm_load1_pd (__P);
143}
144
145/* Load two DPFP values in reverse order.  The address must be aligned.  */
146extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147_mm_loadr_pd (double const *__P)
148{
149  __m128d __tmp = _mm_load_pd (__P);
150  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
151}
152
153/* Store two DPFP values.  The address must be 16-byte aligned.  */
154extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155_mm_store_pd (double *__P, __m128d __A)
156{
157  *(__m128d *)__P = __A;
158}
159
160/* Store two DPFP values.  The address need not be 16-byte aligned.  */
161extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
162_mm_storeu_pd (double *__P, __m128d __A)
163{
164  __builtin_ia32_storeupd (__P, __A);
165}
166
167/* Stores the lower DPFP value.  */
168extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
169_mm_store_sd (double *__P, __m128d __A)
170{
171  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
172}
173
174extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175_mm_cvtsd_f64 (__m128d __A)
176{
177  return __builtin_ia32_vec_ext_v2df (__A, 0);
178}
179
180extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
181_mm_storel_pd (double *__P, __m128d __A)
182{
183  _mm_store_sd (__P, __A);
184}
185
186/* Stores the upper DPFP value.  */
187extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188_mm_storeh_pd (double *__P, __m128d __A)
189{
190  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
191}
192
193/* Store the lower DPFP value across two words.
194   The address must be 16-byte aligned.  */
195extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196_mm_store1_pd (double *__P, __m128d __A)
197{
198  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
199}
200
201extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202_mm_store_pd1 (double *__P, __m128d __A)
203{
204  _mm_store1_pd (__P, __A);
205}
206
207/* Store two DPFP values in reverse order.  The address must be aligned.  */
208extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209_mm_storer_pd (double *__P, __m128d __A)
210{
211  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
212}
213
214extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
215_mm_cvtsi128_si32 (__m128i __A)
216{
217  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
218}
219
220#ifdef __x86_64__
221/* Intel intrinsic.  */
222extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
223_mm_cvtsi128_si64 (__m128i __A)
224{
225  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
226}
227
228/* Microsoft intrinsic.  */
229extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
230_mm_cvtsi128_si64x (__m128i __A)
231{
232  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
233}
234#endif
235
236extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
237_mm_add_pd (__m128d __A, __m128d __B)
238{
239  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
240}
241
242extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243_mm_add_sd (__m128d __A, __m128d __B)
244{
245  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
246}
247
248extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
249_mm_sub_pd (__m128d __A, __m128d __B)
250{
251  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
252}
253
254extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
255_mm_sub_sd (__m128d __A, __m128d __B)
256{
257  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
258}
259
260extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
261_mm_mul_pd (__m128d __A, __m128d __B)
262{
263  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
264}
265
266extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
267_mm_mul_sd (__m128d __A, __m128d __B)
268{
269  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
270}
271
272extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
273_mm_div_pd (__m128d __A, __m128d __B)
274{
275  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
276}
277
278extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
279_mm_div_sd (__m128d __A, __m128d __B)
280{
281  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
282}
283
284extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
285_mm_sqrt_pd (__m128d __A)
286{
287  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
288}
289
/* Return pair {sqrt (B[0]), A[1]}.  */
291extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
292_mm_sqrt_sd (__m128d __A, __m128d __B)
293{
294  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
295  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
296}
297
298extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
299_mm_min_pd (__m128d __A, __m128d __B)
300{
301  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
302}
303
304extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
305_mm_min_sd (__m128d __A, __m128d __B)
306{
307  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
308}
309
310extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311_mm_max_pd (__m128d __A, __m128d __B)
312{
313  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
314}
315
316extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
317_mm_max_sd (__m128d __A, __m128d __B)
318{
319  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
320}
321
322extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
323_mm_and_pd (__m128d __A, __m128d __B)
324{
325  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
326}
327
328extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329_mm_andnot_pd (__m128d __A, __m128d __B)
330{
331  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
332}
333
334extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335_mm_or_pd (__m128d __A, __m128d __B)
336{
337  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
338}
339
340extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341_mm_xor_pd (__m128d __A, __m128d __B)
342{
343  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
344}
345
346extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347_mm_cmpeq_pd (__m128d __A, __m128d __B)
348{
349  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
350}
351
352extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
353_mm_cmplt_pd (__m128d __A, __m128d __B)
354{
355  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
356}
357
358extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
359_mm_cmple_pd (__m128d __A, __m128d __B)
360{
361  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
362}
363
364extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
365_mm_cmpgt_pd (__m128d __A, __m128d __B)
366{
367  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
368}
369
370extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
371_mm_cmpge_pd (__m128d __A, __m128d __B)
372{
373  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
374}
375
376extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
377_mm_cmpneq_pd (__m128d __A, __m128d __B)
378{
379  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
380}
381
382extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
383_mm_cmpnlt_pd (__m128d __A, __m128d __B)
384{
385  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
386}
387
388extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
389_mm_cmpnle_pd (__m128d __A, __m128d __B)
390{
391  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
392}
393
394extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
395_mm_cmpngt_pd (__m128d __A, __m128d __B)
396{
397  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
398}
399
400extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
401_mm_cmpnge_pd (__m128d __A, __m128d __B)
402{
403  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
404}
405
406extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
407_mm_cmpord_pd (__m128d __A, __m128d __B)
408{
409  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
410}
411
412extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
413_mm_cmpunord_pd (__m128d __A, __m128d __B)
414{
415  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
416}
417
418extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
419_mm_cmpeq_sd (__m128d __A, __m128d __B)
420{
421  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
422}
423
424extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
425_mm_cmplt_sd (__m128d __A, __m128d __B)
426{
427  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
428}
429
430extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
431_mm_cmple_sd (__m128d __A, __m128d __B)
432{
433  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
434}
435
436extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
437_mm_cmpgt_sd (__m128d __A, __m128d __B)
438{
439  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
440					 (__v2df)
441					 __builtin_ia32_cmpltsd ((__v2df) __B,
442								 (__v2df)
443								 __A));
444}
445
446extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447_mm_cmpge_sd (__m128d __A, __m128d __B)
448{
449  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
450					 (__v2df)
451					 __builtin_ia32_cmplesd ((__v2df) __B,
452								 (__v2df)
453								 __A));
454}
455
456extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
457_mm_cmpneq_sd (__m128d __A, __m128d __B)
458{
459  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
460}
461
462extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463_mm_cmpnlt_sd (__m128d __A, __m128d __B)
464{
465  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
466}
467
468extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
469_mm_cmpnle_sd (__m128d __A, __m128d __B)
470{
471  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
472}
473
474extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
475_mm_cmpngt_sd (__m128d __A, __m128d __B)
476{
477  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
478					 (__v2df)
479					 __builtin_ia32_cmpnltsd ((__v2df) __B,
480								  (__v2df)
481								  __A));
482}
483
484extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
485_mm_cmpnge_sd (__m128d __A, __m128d __B)
486{
487  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
488					 (__v2df)
489					 __builtin_ia32_cmpnlesd ((__v2df) __B,
490								  (__v2df)
491								  __A));
492}
493
494extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495_mm_cmpord_sd (__m128d __A, __m128d __B)
496{
497  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
498}
499
500extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501_mm_cmpunord_sd (__m128d __A, __m128d __B)
502{
503  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
504}
505
506extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507_mm_comieq_sd (__m128d __A, __m128d __B)
508{
509  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
510}
511
512extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
513_mm_comilt_sd (__m128d __A, __m128d __B)
514{
515  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
516}
517
518extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519_mm_comile_sd (__m128d __A, __m128d __B)
520{
521  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
522}
523
524extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525_mm_comigt_sd (__m128d __A, __m128d __B)
526{
527  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
528}
529
530extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
531_mm_comige_sd (__m128d __A, __m128d __B)
532{
533  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
534}
535
536extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
537_mm_comineq_sd (__m128d __A, __m128d __B)
538{
539  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
540}
541
542extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
543_mm_ucomieq_sd (__m128d __A, __m128d __B)
544{
545  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
546}
547
548extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549_mm_ucomilt_sd (__m128d __A, __m128d __B)
550{
551  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
552}
553
554extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
555_mm_ucomile_sd (__m128d __A, __m128d __B)
556{
557  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
558}
559
560extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
561_mm_ucomigt_sd (__m128d __A, __m128d __B)
562{
563  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
564}
565
566extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
567_mm_ucomige_sd (__m128d __A, __m128d __B)
568{
569  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
570}
571
572extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
573_mm_ucomineq_sd (__m128d __A, __m128d __B)
574{
575  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
576}
577
578/* Create a vector of Qi, where i is the element number.  */
579
580extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
581_mm_set_epi64x (long long __q1, long long __q0)
582{
583  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
584}
585
586extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587_mm_set_epi64 (__m64 __q1,  __m64 __q0)
588{
589  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
590}
591
592extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
594{
595  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
596}
597
598extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
600	       short __q3, short __q2, short __q1, short __q0)
601{
602  return __extension__ (__m128i)(__v8hi){
603    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
604}
605
606extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
608	      char __q11, char __q10, char __q09, char __q08,
609	      char __q07, char __q06, char __q05, char __q04,
610	      char __q03, char __q02, char __q01, char __q00)
611{
612  return __extension__ (__m128i)(__v16qi){
613    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
614    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
615  };
616}
617
618/* Set all of the elements of the vector to A.  */
619
620extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
621_mm_set1_epi64x (long long __A)
622{
623  return _mm_set_epi64x (__A, __A);
624}
625
626extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
627_mm_set1_epi64 (__m64 __A)
628{
629  return _mm_set_epi64 (__A, __A);
630}
631
632extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
633_mm_set1_epi32 (int __A)
634{
635  return _mm_set_epi32 (__A, __A, __A, __A);
636}
637
638extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
639_mm_set1_epi16 (short __A)
640{
641  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
642}
643
644extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
645_mm_set1_epi8 (char __A)
646{
647  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
648		       __A, __A, __A, __A, __A, __A, __A, __A);
649}
650
651/* Create a vector of Qi, where i is the element number.
652   The parameter order is reversed from the _mm_set_epi* functions.  */
653
654extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
655_mm_setr_epi64 (__m64 __q0, __m64 __q1)
656{
657  return _mm_set_epi64 (__q1, __q0);
658}
659
660extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
661_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
662{
663  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
664}
665
666extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
667_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
668	        short __q4, short __q5, short __q6, short __q7)
669{
670  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
671}
672
673extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
674_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
675	       char __q04, char __q05, char __q06, char __q07,
676	       char __q08, char __q09, char __q10, char __q11,
677	       char __q12, char __q13, char __q14, char __q15)
678{
679  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
680		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
681}
682
/* Load and store integer vectors.  Of the loads below, only
   _mm_loadl_epi64 creates a vector with element 0 as *P and the rest
   zero.  */
684
685extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
686_mm_load_si128 (__m128i const *__P)
687{
688  return *__P;
689}
690
691extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692_mm_loadu_si128 (__m128i const *__P)
693{
694  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
695}
696
697extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698_mm_loadl_epi64 (__m128i const *__P)
699{
700  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
701}
702
703extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704_mm_store_si128 (__m128i *__P, __m128i __B)
705{
706  *__P = __B;
707}
708
709extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
710_mm_storeu_si128 (__m128i *__P, __m128i __B)
711{
712  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
713}
714
715extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716_mm_storel_epi64 (__m128i *__P, __m128i __B)
717{
718  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
719}
720
721extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
722_mm_movepi64_pi64 (__m128i __B)
723{
724  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
725}
726
727extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728_mm_movpi64_epi64 (__m64 __A)
729{
730  return _mm_set_epi64 ((__m64)0LL, __A);
731}
732
733extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
734_mm_move_epi64 (__m128i __A)
735{
736  return (__m128i)__builtin_ia32_movq128 ((__v2di) __A);
737}
738
739/* Create an undefined vector.  */
740extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
741_mm_undefined_si128 (void)
742{
743  __m128i __Y = __Y;
744  return __Y;
745}
746
747/* Create a vector of zeros.  */
748extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
749_mm_setzero_si128 (void)
750{
751  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
752}
753
754extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755_mm_cvtepi32_pd (__m128i __A)
756{
757  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
758}
759
760extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
761_mm_cvtepi32_ps (__m128i __A)
762{
763  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
764}
765
766extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
767_mm_cvtpd_epi32 (__m128d __A)
768{
769  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
770}
771
772extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773_mm_cvtpd_pi32 (__m128d __A)
774{
775  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
776}
777
778extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
779_mm_cvtpd_ps (__m128d __A)
780{
781  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
782}
783
784extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785_mm_cvttpd_epi32 (__m128d __A)
786{
787  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
788}
789
790extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791_mm_cvttpd_pi32 (__m128d __A)
792{
793  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
794}
795
796extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797_mm_cvtpi32_pd (__m64 __A)
798{
799  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
800}
801
802extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803_mm_cvtps_epi32 (__m128 __A)
804{
805  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
806}
807
808extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809_mm_cvttps_epi32 (__m128 __A)
810{
811  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
812}
813
814extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815_mm_cvtps_pd (__m128 __A)
816{
817  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
818}
819
820extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821_mm_cvtsd_si32 (__m128d __A)
822{
823  return __builtin_ia32_cvtsd2si ((__v2df) __A);
824}
825
826#ifdef __x86_64__
827/* Intel intrinsic.  */
828extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
829_mm_cvtsd_si64 (__m128d __A)
830{
831  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
832}
833
834/* Microsoft intrinsic.  */
835extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836_mm_cvtsd_si64x (__m128d __A)
837{
838  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
839}
840#endif
841
842extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
843_mm_cvttsd_si32 (__m128d __A)
844{
845  return __builtin_ia32_cvttsd2si ((__v2df) __A);
846}
847
848#ifdef __x86_64__
849/* Intel intrinsic.  */
850extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
851_mm_cvttsd_si64 (__m128d __A)
852{
853  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
854}
855
856/* Microsoft intrinsic.  */
857extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
858_mm_cvttsd_si64x (__m128d __A)
859{
860  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
861}
862#endif
863
864extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865_mm_cvtsd_ss (__m128 __A, __m128d __B)
866{
867  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
868}
869
870extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871_mm_cvtsi32_sd (__m128d __A, int __B)
872{
873  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
874}
875
876#ifdef __x86_64__
877/* Intel intrinsic.  */
878extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
879_mm_cvtsi64_sd (__m128d __A, long long __B)
880{
881  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
882}
883
884/* Microsoft intrinsic.  */
885extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
886_mm_cvtsi64x_sd (__m128d __A, long long __B)
887{
888  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
889}
890#endif
891
892extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
893_mm_cvtss_sd (__m128d __A, __m128 __B)
894{
895  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
896}
897
/* Shuffle A and B with SHUFPD under the selector built by _MM_SHUFFLE2.
   Without __OPTIMIZE__ a macro is used instead of the inline function,
   presumably so the selector still reaches the builtin as a constant
   expression.  */
#ifndef __OPTIMIZE__
#define _mm_shuffle_pd(A, B, N)						\
  ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A),		\
				   (__v2df)(__m128d)(B), (int)(N)))
#else
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
  __v2df __first = (__v2df) __A;
  __v2df __second = (__v2df) __B;
  return (__m128d) __builtin_ia32_shufpd (__first, __second, __mask);
}
#endif
909
910extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
911_mm_unpackhi_pd (__m128d __A, __m128d __B)
912{
913  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
914}
915
916extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
917_mm_unpacklo_pd (__m128d __A, __m128d __B)
918{
919  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
920}
921
922extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
923_mm_loadh_pd (__m128d __A, double const *__B)
924{
925  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
926}
927
928extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
929_mm_loadl_pd (__m128d __A, double const *__B)
930{
931  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
932}
933
934extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
935_mm_movemask_pd (__m128d __A)
936{
937  return __builtin_ia32_movmskpd ((__v2df)__A);
938}
939
940extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
941_mm_packs_epi16 (__m128i __A, __m128i __B)
942{
943  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
944}
945
946extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
947_mm_packs_epi32 (__m128i __A, __m128i __B)
948{
949  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
950}
951
952extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
953_mm_packus_epi16 (__m128i __A, __m128i __B)
954{
955  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
956}
957
958extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
959_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
960{
961  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
962}
963
964extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
965_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
966{
967  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
968}
969
970extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
971_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
972{
973  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
974}
975
976extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
978{
979  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
980}
981
982extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
984{
985  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
986}
987
988extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
989_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
990{
991  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
992}
993
994extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
995_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
996{
997  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
998}
999
1000extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1001_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1002{
1003  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
1004}
1005
1006extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007_mm_add_epi8 (__m128i __A, __m128i __B)
1008{
1009  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
1010}
1011
1012extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013_mm_add_epi16 (__m128i __A, __m128i __B)
1014{
1015  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
1016}
1017
1018extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1019_mm_add_epi32 (__m128i __A, __m128i __B)
1020{
1021  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
1022}
1023
1024extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1025_mm_add_epi64 (__m128i __A, __m128i __B)
1026{
1027  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
1028}
1029
1030extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1031_mm_adds_epi8 (__m128i __A, __m128i __B)
1032{
1033  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
1034}
1035
1036extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1037_mm_adds_epi16 (__m128i __A, __m128i __B)
1038{
1039  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
1040}
1041
1042extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1043_mm_adds_epu8 (__m128i __A, __m128i __B)
1044{
1045  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
1046}
1047
1048extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1049_mm_adds_epu16 (__m128i __A, __m128i __B)
1050{
1051  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
1052}
1053
1054extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1055_mm_sub_epi8 (__m128i __A, __m128i __B)
1056{
1057  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
1058}
1059
1060extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1061_mm_sub_epi16 (__m128i __A, __m128i __B)
1062{
1063  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
1064}
1065
1066extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1067_mm_sub_epi32 (__m128i __A, __m128i __B)
1068{
1069  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
1070}
1071
1072extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1073_mm_sub_epi64 (__m128i __A, __m128i __B)
1074{
1075  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
1076}
1077
1078extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1079_mm_subs_epi8 (__m128i __A, __m128i __B)
1080{
1081  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
1082}
1083
1084extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1085_mm_subs_epi16 (__m128i __A, __m128i __B)
1086{
1087  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
1088}
1089
1090extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1091_mm_subs_epu8 (__m128i __A, __m128i __B)
1092{
1093  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
1094}
1095
1096extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1097_mm_subs_epu16 (__m128i __A, __m128i __B)
1098{
1099  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
1100}
1101
1102extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1103_mm_madd_epi16 (__m128i __A, __m128i __B)
1104{
1105  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
1106}
1107
1108extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1109_mm_mulhi_epi16 (__m128i __A, __m128i __B)
1110{
1111  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
1112}
1113
1114extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1115_mm_mullo_epi16 (__m128i __A, __m128i __B)
1116{
1117  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
1118}
1119
1120extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1121_mm_mul_su32 (__m64 __A, __m64 __B)
1122{
1123  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
1124}
1125
1126extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1127_mm_mul_epu32 (__m128i __A, __m128i __B)
1128{
1129  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
1130}
1131
1132extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1133_mm_slli_epi16 (__m128i __A, int __B)
1134{
1135  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
1136}
1137
1138extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1139_mm_slli_epi32 (__m128i __A, int __B)
1140{
1141  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
1142}
1143
1144extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1145_mm_slli_epi64 (__m128i __A, int __B)
1146{
1147  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
1148}
1149
1150extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1151_mm_srai_epi16 (__m128i __A, int __B)
1152{
1153  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
1154}
1155
1156extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1157_mm_srai_epi32 (__m128i __A, int __B)
1158{
1159  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
1160}
1161
#ifdef __OPTIMIZE__
/* Function form, selected when __OPTIMIZE__ is defined.  The psrldqi128
   builtin receives __A unchanged and __N multiplied by 8 (presumably a
   byte count converted to a bit count -- confirm against the builtin
   documentation).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  /* The scaled count is written directly in the call so that it remains a
     constant expression once this wrapper has been inlined.  */
  return (__m128i)__builtin_ia32_psrldqi128 (__A,
                                             __N * 8);
}

/* Counterpart of _mm_srli_si128 built on the pslldqi128 builtin, with the
   same scaling of __N by 8.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_pslldqi128 (__A,
                                             __N * 8);
}
#else
/* Macro forms for builds without __OPTIMIZE__: identical builtin calls with
   the arguments cast in place.  */
#define _mm_srli_si128(A, N) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), \
                                       (int)(N) * 8))
#define _mm_slli_si128(A, N) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), \
                                       (int)(N) * 8))
#endif
1180
1181extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1182_mm_srli_epi16 (__m128i __A, int __B)
1183{
1184  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
1185}
1186
1187extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188_mm_srli_epi32 (__m128i __A, int __B)
1189{
1190  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
1191}
1192
1193extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1194_mm_srli_epi64 (__m128i __A, int __B)
1195{
1196  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
1197}
1198
1199extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1200_mm_sll_epi16 (__m128i __A, __m128i __B)
1201{
1202  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
1203}
1204
1205extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206_mm_sll_epi32 (__m128i __A, __m128i __B)
1207{
1208  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
1209}
1210
1211extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212_mm_sll_epi64 (__m128i __A, __m128i __B)
1213{
1214  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
1215}
1216
1217extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1218_mm_sra_epi16 (__m128i __A, __m128i __B)
1219{
1220  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
1221}
1222
1223extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1224_mm_sra_epi32 (__m128i __A, __m128i __B)
1225{
1226  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
1227}
1228
1229extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1230_mm_srl_epi16 (__m128i __A, __m128i __B)
1231{
1232  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
1233}
1234
1235extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1236_mm_srl_epi32 (__m128i __A, __m128i __B)
1237{
1238  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
1239}
1240
1241extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1242_mm_srl_epi64 (__m128i __A, __m128i __B)
1243{
1244  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
1245}
1246
1247extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1248_mm_and_si128 (__m128i __A, __m128i __B)
1249{
1250  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
1251}
1252
1253extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1254_mm_andnot_si128 (__m128i __A, __m128i __B)
1255{
1256  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
1257}
1258
1259extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1260_mm_or_si128 (__m128i __A, __m128i __B)
1261{
1262  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
1263}
1264
1265extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1266_mm_xor_si128 (__m128i __A, __m128i __B)
1267{
1268  return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
1269}
1270
1271extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1273{
1274  return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
1275}
1276
1277extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1278_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1279{
1280  return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
1281}
1282
1283extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1284_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1285{
1286  return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
1287}
1288
1289extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290_mm_cmplt_epi8 (__m128i __A, __m128i __B)
1291{
1292  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
1293}
1294
1295extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1296_mm_cmplt_epi16 (__m128i __A, __m128i __B)
1297{
1298  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
1299}
1300
1301extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1302_mm_cmplt_epi32 (__m128i __A, __m128i __B)
1303{
1304  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
1305}
1306
1307extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1308_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1309{
1310  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
1311}
1312
1313extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1314_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1315{
1316  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
1317}
1318
1319extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1320_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1321{
1322  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
1323}
1324
#ifdef __OPTIMIZE__
/* Function form, selected when __OPTIMIZE__ is defined.  Calls the
   vec_ext_v8hi builtin on __A (as a __v8hi) with selector __N; the result
   is narrowed to unsigned short before being widened to the int return
   type, so the returned value is never negative.  */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  __v8hi __vec = (__v8hi)__A;
  return (unsigned short) __builtin_ia32_vec_ext_v8hi (__vec, __N);
}

/* Calls the vec_set_v8hi builtin on __A (as a __v8hi) with the value __D
   and selector __N, returning the resulting vector as an __m128i.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi __vec = (__v8hi)__A;
  return (__m128i) __builtin_ia32_vec_set_v8hi (__vec, __D, __N);
}
#else
/* Macro forms for builds without __OPTIMIZE__: identical builtin calls with
   the arguments cast in place.  */
#define _mm_extract_epi16(A, N) \
  ((int) (unsigned short) \
   __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_insert_epi16(A, D, N) \
  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A), \
                                          (int)(D), \
                                          (int)(N)))
#endif
1344
1345extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346_mm_max_epi16 (__m128i __A, __m128i __B)
1347{
1348  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
1349}
1350
1351extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352_mm_max_epu8 (__m128i __A, __m128i __B)
1353{
1354  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
1355}
1356
1357extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1358_mm_min_epi16 (__m128i __A, __m128i __B)
1359{
1360  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
1361}
1362
1363extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364_mm_min_epu8 (__m128i __A, __m128i __B)
1365{
1366  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
1367}
1368
1369extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370_mm_movemask_epi8 (__m128i __A)
1371{
1372  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
1373}
1374
1375extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376_mm_mulhi_epu16 (__m128i __A, __m128i __B)
1377{
1378  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
1379}
1380
#ifdef __OPTIMIZE__
/* Function forms, selected when __OPTIMIZE__ is defined.  In each of them
   __mask is handed to the builtin unchanged as its last argument.  */

/* pshufhw builtin applied to __A viewed as a __v8hi.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  __v8hi __vec = (__v8hi)__A;
  return (__m128i)__builtin_ia32_pshufhw (__vec, __mask);
}

/* pshuflw builtin applied to __A viewed as a __v8hi.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  __v8hi __vec = (__v8hi)__A;
  return (__m128i)__builtin_ia32_pshuflw (__vec, __mask);
}

/* pshufd builtin applied to __A viewed as a __v4si.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  __v4si __vec = (__v4si)__A;
  return (__m128i)__builtin_ia32_pshufd (__vec, __mask);
}
#else
/* Macro forms for builds without __OPTIMIZE__: identical builtin calls with
   the arguments cast in place.  */
#define _mm_shufflehi_epi16(A, N) \
  ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), \
                                    (int)(N)))
#define _mm_shufflelo_epi16(A, N) \
  ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), \
                                    (int)(N)))
#define _mm_shuffle_epi32(A, N) \
  ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), \
                                   (int)(N)))
#endif
1407
1408extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1409_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
1410{
1411  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
1412}
1413
1414extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1415_mm_avg_epu8 (__m128i __A, __m128i __B)
1416{
1417  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
1418}
1419
1420extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421_mm_avg_epu16 (__m128i __A, __m128i __B)
1422{
1423  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
1424}
1425
1426extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427_mm_sad_epu8 (__m128i __A, __m128i __B)
1428{
1429  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
1430}
1431
/* Passes the int pointer __A and the int __B to the movnti builtin.
   Nothing is returned.  */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  int *__ptr = __A;
  int __val = __B;
  __builtin_ia32_movnti (__ptr, __val);
}
1437
#ifdef __x86_64__
/* Only available when __x86_64__ is defined.  Passes the pointer __A and
   the 64-bit integer __B to the movnti64 builtin.  Nothing is returned.  */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  long long int *__ptr = __A;
  long long int __val = __B;
  __builtin_ia32_movnti64 (__ptr, __val);
}
#endif
1445
1446extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1447_mm_stream_si128 (__m128i *__A, __m128i __B)
1448{
1449  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
1450}
1451
1452extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1453_mm_stream_pd (double *__A, __m128d __B)
1454{
1455  __builtin_ia32_movntpd (__A, (__v2df)__B);
1456}
1457
/* Passes the address __A to the clflush builtin.  Nothing is returned.  */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  void const *__addr = __A;
  __builtin_ia32_clflush (__addr);
}
1463
/* Invokes the lfence builtin; takes no arguments and returns nothing.  */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
  return;
}
1469
/* Invokes the mfence builtin; takes no arguments and returns nothing.  */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
  return;
}
1475
1476extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1477_mm_cvtsi32_si128 (int __A)
1478{
1479  return _mm_set_epi32 (0, 0, 0, __A);
1480}
1481
1482#ifdef __x86_64__
1483/* Intel intrinsic.  */
1484extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1485_mm_cvtsi64_si128 (long long __A)
1486{
1487  return _mm_set_epi64x (0, __A);
1488}
1489
1490/* Microsoft intrinsic.  */
1491extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1492_mm_cvtsi64x_si128 (long long __A)
1493{
1494  return _mm_set_epi64x (0, __A);
1495}
1496#endif
1497
1498/* Casts between various SP, DP, INT vector types.  Note that these do no
1499   conversion of values, they just change the type.  */
1500extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1501_mm_castpd_ps(__m128d __A)
1502{
1503  return (__m128) __A;
1504}
1505
1506extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1507_mm_castpd_si128(__m128d __A)
1508{
1509  return (__m128i) __A;
1510}
1511
1512extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1513_mm_castps_pd(__m128 __A)
1514{
1515  return (__m128d) __A;
1516}
1517
1518extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1519_mm_castps_si128(__m128 __A)
1520{
1521  return (__m128i) __A;
1522}
1523
1524extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1525_mm_castsi128_ps(__m128i __A)
1526{
1527  return (__m128) __A;
1528}
1529
1530extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1531_mm_castsi128_pd(__m128i __A)
1532{
1533  return (__m128d) __A;
1534}
1535
1536#ifdef __DISABLE_SSE2__
1537#undef __DISABLE_SSE2__
1538#pragma GCC pop_options
1539#endif /* __DISABLE_SSE2__ */
1540
1541#endif /* _EMMINTRIN_H_INCLUDED */
1542