/* Copyright (C) 2008-2013 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));
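
/* Note (not in the original header): __may_alias__ is what makes it valid
   to access the same storage both through these vector types and through
   their element types; the load/store and cast intrinsics below rely on
   this to stay within C's effective-type rules.  */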

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f
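
/* Usage sketch (illustrative only): these predicates are meant to be
   passed as the immediate argument of the VCMP-style intrinsics defined
   further down, e.g.

     __m256d __a = _mm256_set1_pd (1.0);
     __m256d __b = _mm256_set1_pd (2.0);
     __m256d __m = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);
     // every lane of __m is all-ones here, since 1.0 < 2.0

   The ordered (O*) forms are false when either operand is NaN, the
   unordered (U*) forms are true; the signaling (*S) forms additionally
   raise the invalid-operation exception on quiet NaNs.  */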

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M)					\
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M)					\
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}
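
/* Usage sketch (illustrative only), given __m256d values __x and __y:
   _mm256_blend_pd selects with a compile-time mask (bit i set selects
   element i of Y), while _mm256_blendv_pd selects per lane from the sign
   bit of a runtime mask:

     __m256d __r1 = _mm256_blend_pd (__x, __y, 0x5);   // y0, x1, y2, x3
     __m256d __lt = _mm256_cmp_pd (__x, __y, _CMP_LT_OQ);
     __m256d __r2 = _mm256_blendv_pd (__x, __y, __lt); // y where x < y, else x
*/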

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M)						\
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif
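
/* Usage sketch (illustrative only), given __m256 values __u and __v: in
   the 8-bit mask the high nibble selects which products enter each
   128-bit lane's sum and the low nibble selects which result elements
   receive that sum (the rest are zeroed).  0xF1 sums all four products
   of each lane into element 0 of that lane:

     __m256 __d = _mm256_dp_ps (__u, __v, 0xF1);
*/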

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif
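
/* Usage sketch (illustrative only): element extraction above first selects
   the 128-bit half (index >> 2 for 32-bit elements) and then the element
   within that half (index % 4), so for some __m256i __x:

     int __e5 = _mm256_extract_epi32 (__x, 5);   // half 1, element 1
*/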

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}
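
/* Note (not in the original header): _mm256_zeroupper is conventionally
   issued before transferring control to legacy SSE code, since clearing
   the upper halves avoids the AVX-to-SSE transition penalty on
   implementations that track a dirty upper state.  */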

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif
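
/* Usage sketch (illustrative only): each nibble of the permute2f128
   control selects a 128-bit source half (0/1 from X, 2/3 from Y), and
   bit 3 of a nibble zeroes that half instead.  0x20 thus concatenates
   the low halves of X and Y, so for __m256d values __x and __y:

     __m256d __lo = _mm256_permute2f128_pd (__x, __y, 0x20);  // x_lo | y_lo
*/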

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}
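
/* Note (not in the original header): the load/store forms above require
   32-byte-aligned addresses and fault otherwise, while the loadu/storeu
   forms accept arbitrary alignment.  */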

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}
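
/* Note (not in the original header): the maskload/maskstore intrinsics
   test the sign bit of each mask element; masked-off load elements read
   as zero and masked-off store elements leave memory untouched.  */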

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}
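
/* Note (not in the original header): the stream intrinsics perform
   non-temporal stores that bypass the cache hierarchy; issuing an
   _mm_sfence before the data is read from another thread is the usual
   discipline.  */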

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
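
/* Usage sketch (illustrative only), for some __m256d __v, using the
   _MM_FROUND_* modes from <smmintrin.h>:

     __m256d __down = _mm256_floor_pd (__v);   // round toward -infinity
     __m256d __near = _mm256_round_pd (__v, _MM_FROUND_TO_NEAREST_INT
					     | _MM_FROUND_NO_EXC);
*/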

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}
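
/* Usage sketch (illustrative only): arguments are given most-significant
   element first, so the last argument lands in element 0 of memory order:

     double __buf[4];
     _mm256_storeu_pd (__buf, _mm256_set_pd (3.0, 2.0, 1.0, 0.0));
     // __buf[0] == 0.0 ... __buf[3] == 3.0
*/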

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
		   char __q27, char __q26, char __q25, char __q24,
		   char __q23, char __q22, char __q21, char __q20,
		   char __q19, char __q18, char __q17, char __q16,
		   char __q15, char __q14, char __q13, char __q12,
		   char __q11, char __q10, char __q09, char __q08,
		   char __q07, char __q06, char __q05, char __q04,
		   char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}
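
/* Usage sketch (illustrative only): _mm256_setr_pd (0.0, 1.0, 2.0, 3.0)
   is equivalent to _mm256_set_pd (3.0, 2.0, 1.0, 0.0); with the setr
   forms the first argument lands in element 0.  */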

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

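/* Usage sketch (illustrative only): casts reinterpret bits rather than
   convert values, so the integer view of 1.0 is its IEEE-754 encoding:

     __m256i __bits = _mm256_castpd_si256 (_mm256_set1_pd (1.0));
     // each 64-bit lane of __bits holds 0x3ff0000000000000
*/
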
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128-bit to a 256-bit type, the low 128 bits
   of the 256-bit result contain the value of the source parameter and the
   upper 128 bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

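/* Usage sketch (illustrative only): when defined upper bits are needed
   after widening, zero them explicitly instead of relying on the cast,
   e.g. by inserting some __m128d __x128 into a zeroed vector:

     __m256d __wide = _mm256_insertf128_pd (_mm256_setzero_pd (),
					    __x128, 0);
*/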