155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes *
355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * Permission is hereby granted, free of charge, to any person obtaining a copy
455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * of this software and associated documentation files (the "Software"), to deal
555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * in the Software without restriction, including without limitation the rights
655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * copies of the Software, and to permit persons to whom the Software is
855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * furnished to do so, subject to the following conditions:
955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes *
1055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * The above copyright notice and this permission notice shall be included in
1155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * all copies or substantial portions of the Software.
1255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes *
1355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes * THE SOFTWARE.
2055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes *
2155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes *===-----------------------------------------------------------------------===
2255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes */
2355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
2401b57e362a2c8abb18ba6139ca212e6c7f2288b0Benjamin Kramer#ifndef __IMMINTRIN_H
2501b57e362a2c8abb18ba6139ca212e6c7f2288b0Benjamin Kramer#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
2601b57e362a2c8abb18ba6139ca212e6c7f2288b0Benjamin Kramer#endif
2755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Internal 256-bit (32-byte) element-typed vectors, used for casts when
 * calling builtins or applying operators. */
typedef double    __v4df  __attribute__((__vector_size__(32)));
typedef float     __v8sf  __attribute__((__vector_size__(32)));
typedef long long __v4di  __attribute__((__vector_size__(32)));
typedef int       __v8si  __attribute__((__vector_size__(32)));
typedef short     __v16hi __attribute__((__vector_size__(32)));
typedef char      __v32qi __attribute__((__vector_size__(32)));

/* Public 256-bit vector types: eight floats, four doubles, and an integer
 * vector declared with long long elements. */
typedef float     __m256  __attribute__((__vector_size__(32)));
typedef double    __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
3855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
3955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Arithmetic */
4055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
4155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_add_pd(__m256d a, __m256d b)
4255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
4355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return a+b;
4455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
4555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
4655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
4755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_add_ps(__m256 a, __m256 b)
4855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
4955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return a+b;
5055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
5155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
5255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
5355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_sub_pd(__m256d a, __m256d b)
5455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
5555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return a-b;
5655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
5755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
5855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
5955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_sub_ps(__m256 a, __m256 b)
6055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
6155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return a-b;
6255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
6355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
6455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
6555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_addsub_pd(__m256d a, __m256d b)
6655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
6755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
6855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
6955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
7055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
7155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_addsub_ps(__m256 a, __m256 b)
7255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
7355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
7455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
7555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
7655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
7755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_div_pd(__m256d a, __m256d b)
7855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
7955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return a / b;
8055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
8155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
8255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
8355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_div_ps(__m256 a, __m256 b)
8455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
8555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return a / b;
8655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
8755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
8855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
8955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_max_pd(__m256d a, __m256d b)
9055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
9155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
9255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
9355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
9455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
9555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_max_ps(__m256 a, __m256 b)
9655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
9755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
9855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
9955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
10055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
10155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_min_pd(__m256d a, __m256d b)
10255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
10355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
10455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
10555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
10655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
10755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_min_ps(__m256 a, __m256 b)
10855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
10955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
11055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
11155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
11255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
11355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_mul_pd(__m256d a, __m256d b)
11455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
11555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return a * b;
11655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
11755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
11855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
11955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_mul_ps(__m256 a, __m256 b)
12055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
12155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return a * b;
12255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
12355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
12455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
12555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_sqrt_pd(__m256d a)
12655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
12755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
12855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
12955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
13055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
13155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_sqrt_ps(__m256 a)
13255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
13355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
13455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
13555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
13655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
13755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_rsqrt_ps(__m256 a)
13855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
13955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
14055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
14155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
14255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
14355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_rcp_ps(__m256 a)
14455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
14555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
14655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
14755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Applies __builtin_ia32_roundpd256 to V (a vector of four doubles), passing
 * M through as the builtin's second operand (the rounding control).
 * V is first copied into a typed temporary, so it is evaluated exactly once
 * and must be convertible to __m256d. */
#define _mm256_round_pd(V, M) __extension__ ({ \
  __m256d __src = (V); \
  (__m256d)__builtin_ia32_roundpd256((__v4df)__src, (M)); })
15155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Applies __builtin_ia32_roundps256 to V (a vector of eight floats), passing
 * M through as the builtin's second operand (the rounding control).
 * V is first copied into a typed temporary, so it is evaluated exactly once
 * and must be convertible to __m256. */
#define _mm256_round_ps(V, M) __extension__ ({ \
  __m256 __src = (V); \
  (__m256)__builtin_ia32_roundps256((__v8sf)__src, (M)); })
15555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Convenience forms of _mm256_round_pd / _mm256_round_ps with the rounding
 * operand fixed to _MM_FROUND_CEIL or _MM_FROUND_FLOOR.
 * NOTE(review): the _MM_FROUND_* constants are not defined in this header;
 * presumably they come from the SSE4.1 header pulled in by <immintrin.h> --
 * confirm the include order guarantees they are visible at expansion time. */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
16055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
16155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Logical */
16255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
16355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_and_pd(__m256d a, __m256d b)
16455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
165da6adc43542616701a406bf767608ea2de929019Bruno Cardoso Lopes  return (__m256d)((__v4di)a & (__v4di)b);
16655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
16755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
16855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
16955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_and_ps(__m256 a, __m256 b)
17055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
171da6adc43542616701a406bf767608ea2de929019Bruno Cardoso Lopes  return (__m256)((__v8si)a & (__v8si)b);
17255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
17355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
17455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
17555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_andnot_pd(__m256d a, __m256d b)
17655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
177da6adc43542616701a406bf767608ea2de929019Bruno Cardoso Lopes  return (__m256d)(~(__v4di)a & (__v4di)b);
17855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
17955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
18055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
18155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_andnot_ps(__m256 a, __m256 b)
18255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
183da6adc43542616701a406bf767608ea2de929019Bruno Cardoso Lopes  return (__m256)(~(__v8si)a & (__v8si)b);
18455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
18555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
18655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
18755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_or_pd(__m256d a, __m256d b)
18855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
189da6adc43542616701a406bf767608ea2de929019Bruno Cardoso Lopes  return (__m256d)((__v4di)a | (__v4di)b);
19055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
19155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
19255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
19355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_or_ps(__m256 a, __m256 b)
19455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
195da6adc43542616701a406bf767608ea2de929019Bruno Cardoso Lopes  return (__m256)((__v8si)a | (__v8si)b);
19655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
19755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
19855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
19955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_xor_pd(__m256d a, __m256d b)
20055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
201da6adc43542616701a406bf767608ea2de929019Bruno Cardoso Lopes  return (__m256d)((__v4di)a ^ (__v4di)b);
20255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
20355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
20455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
20555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_xor_ps(__m256 a, __m256 b)
20655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
207da6adc43542616701a406bf767608ea2de929019Bruno Cardoso Lopes  return (__m256)((__v8si)a ^ (__v8si)b);
20855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
20955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
21055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Horizontal arithmetic */
21155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
21255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_hadd_pd(__m256d a, __m256d b)
21355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
21455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
21555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
21655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
21755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
21855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_hadd_ps(__m256 a, __m256 b)
21955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
22055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
22155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
22255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
22355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
22455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_hsub_pd(__m256d a, __m256d b)
22555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
22655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
22755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
22855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
22955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
23055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_hsub_ps(__m256 a, __m256 b)
23155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
23255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
23355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
23455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
23555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Vector permutations */
23655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128d __attribute__((__always_inline__, __nodebug__))
23755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_permutevar_pd(__m128d a, __m128i c)
23855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
23955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
24055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
24155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
24255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
24355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_permutevar_pd(__m256d a, __m256i c)
24455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
24555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
24655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
24755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
24855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128 __attribute__((__always_inline__, __nodebug__))
24955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_permutevar_ps(__m128 a, __m128i c)
25055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
25155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
25255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
25355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
25455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
25555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_permutevar_ps(__m256 a, __m256i c)
25655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
25755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
25855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes						  (__v8si)c);
25955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
26055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Permutes the two doubles of A using the low two bits of C:
 *   result[0] = A[bit 0 of C],  result[1] = A[bit 1 of C].
 * Every shuffle index is 0 or 1, so the zero vector given as the second
 * shuffle operand is never selected; it only fills the operand slot.
 * A is copied into a typed temporary and therefore evaluated once; C is
 * expanded twice. */
#define _mm_permute_pd(A, C) __extension__ ({ \
  __m128d __src = (A); \
  (__m128d)__builtin_shufflevector( \
      (__v2df)__src, (__v2df) _mm_setzero_pd(), \
      (C) & 0x1, \
      ((C) & 0x2) >> 1); })
26555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Permutes the four doubles of A within each 128-bit half using the low four
 * bits of C:
 *   result[0] = A[bit 0],      result[1] = A[bit 1],
 *   result[2] = A[2 + bit 2],  result[3] = A[2 + bit 3].
 * Every shuffle index is below 4, so the zero vector given as the second
 * shuffle operand is never selected.  A is evaluated once (typed temporary);
 * C is expanded several times. */
#define _mm256_permute_pd(A, C) __extension__ ({ \
  __m256d __src = (A); \
  (__m256d)__builtin_shufflevector( \
      (__v4df)__src, (__v4df) _mm256_setzero_pd(), \
      (C) & 0x1, \
      ((C) & 0x2) >> 1, \
      2 + (((C) & 0x4) >> 2), \
      2 + (((C) & 0x8) >> 3)); })
27255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Permutes the four floats of A using the four 2-bit fields of C:
 *   result[i] = A[(C >> 2*i) & 3]  for i = 0..3.
 * Every shuffle index is below 4, so the zero vector given as the second
 * shuffle operand is never selected.  A is evaluated once (typed temporary);
 * C is expanded several times. */
#define _mm_permute_ps(A, C) __extension__ ({ \
  __m128 __src = (A); \
  (__m128)__builtin_shufflevector( \
      (__v4sf)__src, (__v4sf) _mm_setzero_ps(), \
      (C) & 0x3, \
      ((C) & 0xc) >> 2, \
      ((C) & 0x30) >> 4, \
      ((C) & 0xc0) >> 6); })
27855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Permutes the eight floats of A, applying the same four 2-bit fields of C
 * to both 128-bit halves:
 *   result[i]     = A[(C >> 2*i) & 3]        for i = 0..3 (low half),
 *   result[4 + i] = A[4 + ((C >> 2*i) & 3)]  for i = 0..3 (high half).
 * Every shuffle index is below 8, so the zero vector given as the second
 * shuffle operand is never selected.  A is evaluated once (typed temporary);
 * C is expanded several times. */
#define _mm256_permute_ps(A, C) __extension__ ({ \
  __m256 __src = (A); \
  (__m256)__builtin_shufflevector( \
      (__v8sf)__src, (__v8sf) _mm256_setzero_ps(), \
      (C) & 0x3, \
      ((C) & 0xc) >> 2, \
      ((C) & 0x30) >> 4, \
      ((C) & 0xc0) >> 6, \
      4 + ((C) & 0x03), \
      4 + (((C) & 0x0c) >> 2), \
      4 + (((C) & 0x30) >> 4), \
      4 + (((C) & 0xc0) >> 6)); })
28855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Forwards two vectors of four doubles and the operand M to
 * __builtin_ia32_vperm2f128_pd256.  Per the Intel intrinsics reference, M
 * chooses which 128-bit halves of V1/V2 form each half of the result.
 * V1 and V2 are copied into typed temporaries, so each is evaluated once. */
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  __m256d __lhs = (V1); \
  __m256d __rhs = (V2); \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__lhs, \
                                           (__v4df)__rhs, (M)); })
29355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Forwards two vectors of eight floats and the operand M to
 * __builtin_ia32_vperm2f128_ps256.  Per the Intel intrinsics reference, M
 * chooses which 128-bit halves of V1/V2 form each half of the result.
 * V1 and V2 are copied into typed temporaries, so each is evaluated once. */
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  __m256 __lhs = (V1); \
  __m256 __rhs = (V2); \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__lhs, \
                                          (__v8sf)__rhs, (M)); })
29855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Forwards two 256-bit integer vectors (viewed as eight ints) and the operand
 * M to __builtin_ia32_vperm2f128_si256.  Per the Intel intrinsics reference,
 * M chooses which 128-bit halves of V1/V2 form each half of the result.
 * V1 and V2 are copied into typed temporaries, so each is evaluated once. */
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  __m256i __lhs = (V1); \
  __m256i __rhs = (V2); \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__lhs, \
                                           (__v8si)__rhs, (M)); })
30355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
30455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Vector Blend */
/* Forwards two vectors of four doubles and the operand M to
 * __builtin_ia32_blendpd256.  Per the Intel intrinsics reference, bit i of M
 * picks lane i from V2 when set and from V1 when clear.
 * V1 and V2 are copied into typed temporaries, so each is evaluated once. */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  __m256d __lhs = (V1); \
  __m256d __rhs = (V2); \
  (__m256d)__builtin_ia32_blendpd256((__v4df)__lhs, (__v4df)__rhs, (M)); })
30955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Forwards two vectors of eight floats and the operand M to
 * __builtin_ia32_blendps256.  Per the Intel intrinsics reference, bit i of M
 * picks lane i from V2 when set and from V1 when clear.
 * V1 and V2 are copied into typed temporaries, so each is evaluated once. */
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  __m256 __lhs = (V1); \
  __m256 __rhs = (V2); \
  (__m256)__builtin_ia32_blendps256((__v8sf)__lhs, (__v8sf)__rhs, (M)); })
31455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
31555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
31655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
31755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
31855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
31955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
32055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
32155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
32255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
32355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
32455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
32555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
32655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
32755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Vector Dot Product */
/* Forwards two vectors of eight floats and the operand M to
 * __builtin_ia32_dpps256.  Per the Intel intrinsics reference, this computes
 * a masked dot product independently in each 128-bit half, with M selecting
 * the input lanes to multiply and the output lanes to write.
 * V1 and V2 are copied into typed temporaries, so each is evaluated once. */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  __m256 __lhs = (V1); \
  __m256 __rhs = (V2); \
  (__m256)__builtin_ia32_dpps256((__v8sf)__lhs, (__v8sf)__rhs, (M)); })
33255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
33355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Vector shuffle */
/* Builds eight floats from a and b using the four 2-bit fields f0..f3 of
 * mask (f0 = bits 1:0, ..., f3 = bits 7:6).  Shuffle indices 0-7 name lanes
 * of a and 8-15 name lanes of b, so:
 *   result[0] = a[f0],      result[1] = a[f1],
 *   result[2] = b[f2],      result[3] = b[f3],
 *   result[4] = a[4 + f0],  result[5] = a[4 + f1],
 *   result[6] = b[4 + f2],  result[7] = b[4 + f3].
 * a and b are copied into typed temporaries and evaluated once each; mask is
 * expanded several times. */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  __m256 __lhs = (a); \
  __m256 __rhs = (b); \
  (__m256)__builtin_shufflevector((__v8sf)__lhs, (__v8sf)__rhs, \
      (mask) & 0x3, \
      ((mask) & 0xc) >> 2, \
      8 + (((mask) & 0x30) >> 4), \
      8 + (((mask) & 0xc0) >> 6), \
      4 + ((mask) & 0x3), \
      4 + (((mask) & 0xc) >> 2), \
      12 + (((mask) & 0x30) >> 4), \
      12 + (((mask) & 0xc0) >> 6)); })
342b33aa0f7dfa3a6cadc8ac1ac910f36680cbf7a76Bruno Cardoso Lopes
/* Builds four doubles from a and b using the low four bits of mask.  Shuffle
 * indices 0-3 name lanes of a and 4-7 name lanes of b, so:
 *   result[0] = a[bit 0],      result[1] = b[bit 1],
 *   result[2] = a[2 + bit 2],  result[3] = b[2 + bit 3].
 * a and b are copied into typed temporaries and evaluated once each; mask is
 * expanded several times. */
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  __m256d __lhs = (a); \
  __m256d __rhs = (b); \
  (__m256d)__builtin_shufflevector((__v4df)__lhs, (__v4df)__rhs, \
      (mask) & 0x1, \
      4 + (((mask) & 0x2) >> 1), \
      2 + (((mask) & 0x4) >> 2), \
      6 + (((mask) & 0x8) >> 3)); })
35155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Compare */
/*
 * Comparison predicate encodings, 0x00 through 0x1f.
 *
 * Naming, as spelled out by the per-constant comments: the suffix letter
 * O/U stands for ordered/unordered and Q/S for non-signaling ("quiet") /
 * signaling.  Values 0x10..0x1f repeat the relations of 0x00..0x0f with the
 * signaling behaviour flipped.
 * NOTE(review): presumably these are the values expected for the `c`
 * operand of the _mm_cmp_* / _mm256_cmp_* macros that follow -- those macros
 * only forward `c` to the builtin, so confirm against the instruction
 * reference.
 */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
38555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/*
 * _mm_cmp_pd(a, b, c)
 *
 * Evaluates a and b once each into __m128d temporaries and forwards them
 * (as __v2df) with c to __builtin_ia32_cmppd; yields the result as __m128d.
 * c is pasted into the call unchanged.
 */
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __cmp_lhs = (a); \
  __m128d __cmp_rhs = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__cmp_lhs, (__v2df)__cmp_rhs, (c)); \
})
39032bae37b821e6ade738849ac14e3d3de06afb0beBob Wilson
/*
 * _mm_cmp_ps(a, b, c)
 *
 * Evaluates a and b once each into __m128 temporaries and forwards them
 * (as __v4sf) with c to __builtin_ia32_cmpps; yields the result as __m128.
 * c is pasted into the call unchanged.
 */
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __cmp_lhs = (a); \
  __m128 __cmp_rhs = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__cmp_lhs, (__v4sf)__cmp_rhs, (c)); \
})
39532bae37b821e6ade738849ac14e3d3de06afb0beBob Wilson
/*
 * _mm256_cmp_pd(a, b, c)
 *
 * Evaluates a and b once each into __m256d temporaries and forwards them
 * (as __v4df) with c to __builtin_ia32_cmppd256; yields the result as
 * __m256d.  c is pasted into the call unchanged.
 */
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __cmp_lhs = (a); \
  __m256d __cmp_rhs = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__cmp_lhs, (__v4df)__cmp_rhs, \
                                   (c)); \
})
40032bae37b821e6ade738849ac14e3d3de06afb0beBob Wilson
/*
 * _mm256_cmp_ps(a, b, c)
 *
 * Evaluates a and b once each into __m256 temporaries and forwards them
 * (as __v8sf) with c to __builtin_ia32_cmpps256; yields the result as
 * __m256.  c is pasted into the call unchanged.
 */
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __cmp_lhs = (a); \
  __m256 __cmp_rhs = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__cmp_lhs, (__v8sf)__cmp_rhs, \
                                  (c)); \
})
40532bae37b821e6ade738849ac14e3d3de06afb0beBob Wilson
/*
 * _mm_cmp_sd(a, b, c)
 *
 * Evaluates a and b once each into __m128d temporaries and forwards them
 * (as __v2df) with c to __builtin_ia32_cmpsd; yields the result as __m128d.
 * NOTE(review): presumably only the lowest element is compared (scalar
 * form) -- confirm against the instruction reference.
 */
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __cmp_lhs = (a); \
  __m128d __cmp_rhs = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__cmp_lhs, (__v2df)__cmp_rhs, (c)); \
})
41032bae37b821e6ade738849ac14e3d3de06afb0beBob Wilson
/*
 * _mm_cmp_ss(a, b, c)
 *
 * Evaluates a and b once each into __m128 temporaries and forwards them
 * (as __v4sf) with c to __builtin_ia32_cmpss; yields the result as __m128.
 * NOTE(review): presumably only the lowest element is compared (scalar
 * form) -- confirm against the instruction reference.
 */
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __cmp_lhs = (a); \
  __m128 __cmp_rhs = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__cmp_lhs, (__v4sf)__cmp_rhs, (c)); \
})
41555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Vector extract */
/*
 * _mm256_extractf128_pd(A, O)
 *
 * Evaluates A once into a __m256d temporary and forwards it (as __v4df)
 * with O to __builtin_ia32_vextractf128_pd256; yields a __m128d.
 * NOTE(review): presumably O selects the low (0) or high (1) 128-bit half
 * -- confirm against the instruction reference.
 */
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
  __m256d __ext_src = (A); \
  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__ext_src, (O)); \
})
42055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/*
 * _mm256_extractf128_ps(A, O)
 *
 * Evaluates A once into a __m256 temporary and forwards it (as __v8sf)
 * with O to __builtin_ia32_vextractf128_ps256; yields a __m128.
 */
#define _mm256_extractf128_ps(A, O) __extension__ ({ \
  __m256 __ext_src = (A); \
  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__ext_src, (O)); \
})
42455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/*
 * _mm256_extractf128_si256(A, O)
 *
 * Evaluates A once into a __m256i temporary and forwards it (as __v8si)
 * with O to __builtin_ia32_vextractf128_si256; yields a __m128i.
 */
#define _mm256_extractf128_si256(A, O) __extension__ ({ \
  __m256i __ext_src = (A); \
  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__ext_src, (O)); \
})
42855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
42955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
43055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_extract_epi32(__m256i a, int const imm)
43155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
43255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __v8si b = (__v8si)a;
43355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return b[imm];
43455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
43555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
43655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
43755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_extract_epi16(__m256i a, int const imm)
43855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
43955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __v16hi b = (__v16hi)a;
44055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return b[imm];
44155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
44255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
44355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
44455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_extract_epi8(__m256i a, int const imm)
44555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
44655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __v32qi b = (__v32qi)a;
44755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return b[imm];
44855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
44955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
45055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes#ifdef __x86_64__
45155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline long long  __attribute__((__always_inline__, __nodebug__))
45255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_extract_epi64(__m256i a, const int imm)
45355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
45455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __v4di b = (__v4di)a;
45555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return b[imm];
45655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
45755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes#endif
45855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Vector insert */
/*
 * _mm256_insertf128_pd(V1, V2, O)
 *
 * Evaluates V1 (__m256d) and V2 (__m128d) once each and forwards them (as
 * __v4df / __v2df) with O to __builtin_ia32_vinsertf128_pd256; yields a
 * __m256d.
 * NOTE(review): presumably O selects which 128-bit half of V1 is replaced
 * by V2 -- confirm against the instruction reference.
 */
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
  __m256d __ins_dst = (V1); \
  __m128d __ins_src = (V2); \
  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__ins_dst, \
                                            (__v2df)__ins_src, (O)); \
})
46455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/*
 * _mm256_insertf128_ps(V1, V2, O)
 *
 * Evaluates V1 (__m256) and V2 (__m128) once each and forwards them (as
 * __v8sf / __v4sf) with O to __builtin_ia32_vinsertf128_ps256; yields a
 * __m256.
 */
#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
  __m256 __ins_dst = (V1); \
  __m128 __ins_src = (V2); \
  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__ins_dst, \
                                           (__v4sf)__ins_src, (O)); \
})
46955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/*
 * _mm256_insertf128_si256(V1, V2, O)
 *
 * Evaluates V1 (__m256i) and V2 (__m128i) once each and forwards them (as
 * __v8si / __v4si) with O to __builtin_ia32_vinsertf128_si256; yields a
 * __m256i.
 */
#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
  __m256i __ins_dst = (V1); \
  __m128i __ins_src = (V2); \
  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__ins_dst, \
                                            (__v4si)__ins_src, (O)); \
})
47455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
47555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
47655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_insert_epi32(__m256i a, int b, int const imm)
47755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
47855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __v8si c = (__v8si)a;
47955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  c[imm & 7] = b;
48055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)c;
48155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
48255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
48355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
48455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_insert_epi16(__m256i a, int b, int const imm)
48555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
48655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __v16hi c = (__v16hi)a;
48755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  c[imm & 15] = b;
48855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)c;
48955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
49055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
49155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
49255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_insert_epi8(__m256i a, int b, int const imm)
49355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
49455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __v32qi c = (__v32qi)a;
49555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  c[imm & 31] = b;
49655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)c;
49755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
49855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
49955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes#ifdef __x86_64__
50055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
50155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_insert_epi64(__m256i a, int b, int const imm)
50255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
50355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __v4di c = (__v4di)a;
50455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  c[imm & 3] = b;
50555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)c;
50655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
50755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes#endif
50855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
50955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Conversion */
51055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
51155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_cvtepi32_pd(__m128i a)
51255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
51355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
51455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
51555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
51655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
51755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_cvtepi32_ps(__m256i a)
51855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
51955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
52055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
52155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
52255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128 __attribute__((__always_inline__, __nodebug__))
52355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_cvtpd_ps(__m256d a)
52455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
52555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
52655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
52755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
52855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
52955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_cvtps_epi32(__m256 a)
53055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
53155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
53255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
53355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
53455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
53555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_cvtps_pd(__m128 a)
53655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
53755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
53855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
53955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
54055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128i __attribute__((__always_inline__, __nodebug__))
54155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_cvttpd_epi32(__m256d a)
54255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
54355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
54455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
54555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
54655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128i __attribute__((__always_inline__, __nodebug__))
54755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_cvtpd_epi32(__m256d a)
54855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
54955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
55055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
55155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
55255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
55355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_cvttps_epi32(__m256 a)
55455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
55555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
55655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
55755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
55855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Vector replicate */
55955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
56055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_movehdup_ps(__m256 a)
56155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
5624a5496bdd50f6cec5f8eb252665503e5431708d9Bruno Cardoso Lopes  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
56355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
56455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
56555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
56655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_moveldup_ps(__m256 a)
56755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
5684a5496bdd50f6cec5f8eb252665503e5431708d9Bruno Cardoso Lopes  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
56955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
57055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
57155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
57255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_movedup_pd(__m256d a)
57355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
5744a5496bdd50f6cec5f8eb252665503e5431708d9Bruno Cardoso Lopes  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
57555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
57655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
57755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Unpack and Interleave */
57855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
57955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_unpackhi_pd(__m256d a, __m256d b)
58055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
581f0e96c925858a513c275f0aec89f049e065c78dbBruno Cardoso Lopes  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
58255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
58355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
58455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
58555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_unpacklo_pd(__m256d a, __m256d b)
58655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
587f0e96c925858a513c275f0aec89f049e065c78dbBruno Cardoso Lopes  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
58855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
58955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
59055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
59155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_unpackhi_ps(__m256 a, __m256 b)
59255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
593f0e96c925858a513c275f0aec89f049e065c78dbBruno Cardoso Lopes  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
59455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
59555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
59655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
59755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_unpacklo_ps(__m256 a, __m256 b)
59855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
599f0e96c925858a513c275f0aec89f049e065c78dbBruno Cardoso Lopes  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
60055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
60155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
60255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Bit Test */
60355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
60455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_testz_pd(__m128d a, __m128d b)
60555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
60655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
60755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
60855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
60955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
61055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_testc_pd(__m128d a, __m128d b)
61155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
61255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
61355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
61455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
61555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
61655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_testnzc_pd(__m128d a, __m128d b)
61755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
61855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
61955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
62055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
62155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
62255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_testz_ps(__m128 a, __m128 b)
62355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
62455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
62555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
62655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
62755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
62855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_testc_ps(__m128 a, __m128 b)
62955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
63055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
63155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
63255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
63355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
63455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_testnzc_ps(__m128 a, __m128 b)
63555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
63655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
63755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
63855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
63955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
64055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_testz_pd(__m256d a, __m256d b)
64155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
64255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
64355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
64455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
64555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
64655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_testc_pd(__m256d a, __m256d b)
64755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
64855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
64955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
65055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
65155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
65255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_testnzc_pd(__m256d a, __m256d b)
65355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
65455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
65555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
65655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
65755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
65855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_testz_ps(__m256 a, __m256 b)
65955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
66055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
66155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
66255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
66355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
66455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_testc_ps(__m256 a, __m256 b)
66555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
66655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
66755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
66855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
66955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
67055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_testnzc_ps(__m256 a, __m256 b)
67155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
67255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
67355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
67455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
67555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
67655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_testz_si256(__m256i a, __m256i b)
67755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
67855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
67955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
68055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
68155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
68255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_testc_si256(__m256i a, __m256i b)
68355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
68455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
68555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
68655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
68755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
68855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_testnzc_si256(__m256i a, __m256i b)
68955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
69055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
69155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
69255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
69355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Vector extract sign mask */
69455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
69555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_movemask_pd(__m256d a)
69655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
69755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_movmskpd256((__v4df)a);
69855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
69955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
70055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline int __attribute__((__always_inline__, __nodebug__))
70155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_movemask_ps(__m256 a)
70255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
70355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return __builtin_ia32_movmskps256((__v8sf)a);
70455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
70555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
70655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Vector zero */
/* Invokes __builtin_ia32_vzeroall. Takes no arguments and returns nothing;
   the effect is entirely that of the builtin (presumably the VZEROALL
   instruction -- confirm against the ISA reference). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  (void)__builtin_ia32_vzeroall();
}
71255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
/* Invokes __builtin_ia32_vzeroupper. Takes no arguments and returns nothing;
   the effect is entirely that of the builtin (presumably the VZEROUPPER
   instruction -- confirm against the ISA reference). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  (void)__builtin_ia32_vzeroupper();
}
71855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
71955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Vector load with broadcast */
72055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128 __attribute__((__always_inline__, __nodebug__))
72155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_broadcast_ss(float const *a)
72255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
72355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m128)__builtin_ia32_vbroadcastss(a);
72455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
72555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
72655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
72755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_broadcast_sd(double const *a)
72855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
72955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
73055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
73155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
73255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
73355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_broadcast_ss(float const *a)
73455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
73555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_vbroadcastss256(a);
73655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
73755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
73855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
73955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_broadcast_pd(__m128d const *a)
74055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
74155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
74255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
74355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
74455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
74555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_broadcast_ps(__m128 const *a)
74655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
74755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
74855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
74955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
75055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* SIMD load ops */
75155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
75255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_load_pd(double const *p)
75355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
75455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return *(__m256d *)p;
75555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
75655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
75755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
75855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_load_ps(float const *p)
75955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
76055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return *(__m256 *)p;
76155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
76255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
76355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
76455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_loadu_pd(double const *p)
76555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
7662ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper  struct __loadu_pd {
7672ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper    __m256d v;
7682ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper  } __attribute__((packed, may_alias));
7692ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper  return ((struct __loadu_pd*)p)->v;
77055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
77155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
77255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
77355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_loadu_ps(float const *p)
77455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
7752ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper  struct __loadu_ps {
7762ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper    __m256 v;
7772ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper  } __attribute__((packed, may_alias));
7782ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper  return ((struct __loadu_ps*)p)->v;
77955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
78055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
78155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
78255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_load_si256(__m256i const *p)
78355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
78455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return *p;
78555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
78655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
78755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
78855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_loadu_si256(__m256i const *p)
78955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
7902ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper  struct __loadu_si256 {
7912ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper    __m256i v;
7922ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper  } __attribute__((packed, may_alias));
7932ee2ac2293f313dfe1c6eb7034527a92b5d23158Craig Topper  return ((struct __loadu_si256*)p)->v;
79455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
79555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
79655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
79755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_lddqu_si256(__m256i const *p)
79855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
79955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
80055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
80155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
80255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* SIMD store ops */
80355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
80455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_store_pd(double *p, __m256d a)
80555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
80655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  *(__m256d *)p = a;
80755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
80855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
80955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
81055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_store_ps(float *p, __m256 a)
81155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
81255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  *(__m256 *)p = a;
81355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
81455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
81555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
81655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_storeu_pd(double *p, __m256d a)
81755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
81855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __builtin_ia32_storeupd256(p, (__v4df)a);
81955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
82055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
82155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
82255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_storeu_ps(float *p, __m256 a)
82355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
82455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __builtin_ia32_storeups256(p, (__v8sf)a);
82555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
82655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
82755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
82855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_store_si256(__m256i *p, __m256i a)
82955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
83055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  *p = a;
83155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
83255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
83355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
83455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_storeu_si256(__m256i *p, __m256i a)
83555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
83655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
83755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
83855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
83955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Conditional load ops */
84055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128d __attribute__((__always_inline__, __nodebug__))
84155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_maskload_pd(double const *p, __m128d m)
84255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
84355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
84455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
84555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
84655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
84755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_maskload_pd(double const *p, __m256d m)
84855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
84955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
85055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
85155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
85255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128 __attribute__((__always_inline__, __nodebug__))
85355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_maskload_ps(float const *p, __m128 m)
85455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
85555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
85655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
85755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
85855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
85955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_maskload_ps(float const *p, __m256 m)
86055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
86155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
86255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
86355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
86455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Conditional store ops */
86555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
86655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
86755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
86855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
86955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
87055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
87155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
87255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_maskstore_pd(double *p, __m128d m, __m128d a)
87355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
87455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
87555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
87655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
87755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
87855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
87955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
88055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
88155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
88255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
88355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
88455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm_maskstore_ps(float *p, __m128 m, __m128 a)
88555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
88655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
88755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
88855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
88955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Cacheability support ops */
89055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
89155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_stream_si256(__m256i *a, __m256i b)
89255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
89355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
89455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
89555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
89655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
89755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_stream_pd(double *a, __m256d b)
89855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
89955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __builtin_ia32_movntpd256(a, (__v4df)b);
90055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
90155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
90255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline void __attribute__((__always_inline__, __nodebug__))
90355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_stream_ps(float *p, __m256 a)
90455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
90555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  __builtin_ia32_movntps256(p, (__v8sf)a);
90655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
90755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
90855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Create vectors */
90955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
91055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set_pd(double a, double b, double c, double d)
91155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
91255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d){ d, c, b, a };
91355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
91455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
91555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
91655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set_ps(float a, float b, float c, float d,
91755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes	            float e, float f, float g, float h)
91855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
91955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256){ h, g, f, e, d, c, b, a };
92055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
92155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
92255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
92355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set_epi32(int i0, int i1, int i2, int i3,
92455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             int i4, int i5, int i6, int i7)
92555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
92655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
92755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
92855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
92955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
93055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set_epi16(short w15, short w14, short w13, short w12,
93155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             short w11, short w10, short w09, short w08,
93255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             short w07, short w06, short w05, short w04,
93355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             short w03, short w02, short w01, short w00)
93455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
93555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
93655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes                             w08, w09, w10, w11, w12, w13, w14, w15 };
93755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
93855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
93955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
94055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set_epi8(char b31, char b30, char b29, char b28,
94155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		            char b27, char b26, char b25, char b24,
94255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		            char b23, char b22, char b21, char b20,
94355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		            char b19, char b18, char b17, char b16,
94455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		            char b15, char b14, char b13, char b12,
94555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		            char b11, char b10, char b09, char b08,
94655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		            char b07, char b06, char b05, char b04,
94755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		            char b03, char b02, char b01, char b00)
94855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
94955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v32qi){
95055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes    b00, b01, b02, b03, b04, b05, b06, b07,
95155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes    b08, b09, b10, b11, b12, b13, b14, b15,
95255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes    b16, b17, b18, b19, b20, b21, b22, b23,
95355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes    b24, b25, b26, b27, b28, b29, b30, b31
95455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  };
95555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
95655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
95755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
95855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set_epi64x(long long a, long long b, long long c, long long d)
95955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
96055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v4di){ d, c, b, a };
96155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
96255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
96355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Create vectors with elements in reverse order */
96455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
96555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_setr_pd(double a, double b, double c, double d)
96655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
96755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d){ a, b, c, d };
96855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
96955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
97055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
97155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_setr_ps(float a, float b, float c, float d,
97255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		           float e, float f, float g, float h)
97355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
97455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256){ a, b, c, d, e, f, g, h };
97555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
97655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
97755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
97855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_setr_epi32(int i0, int i1, int i2, int i3,
97955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		              int i4, int i5, int i6, int i7)
98055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
98155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
98255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
98355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
98455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
98555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_setr_epi16(short w15, short w14, short w13, short w12,
98655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		   short w11, short w10, short w09, short w08,
98755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		   short w07, short w06, short w05, short w04,
98855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		   short w03, short w02, short w01, short w00)
98955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
99055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
99155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes			                       w07, w06, w05, w04, w03, w02, w01, w00 };
99255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
99355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
99455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
99555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_setr_epi8(char b31, char b30, char b29, char b28,
99655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             char b27, char b26, char b25, char b24,
99755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             char b23, char b22, char b21, char b20,
99855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             char b19, char b18, char b17, char b16,
99955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             char b15, char b14, char b13, char b12,
100055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             char b11, char b10, char b09, char b08,
100155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             char b07, char b06, char b05, char b04,
100255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		             char b03, char b02, char b01, char b00)
100355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
100455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v32qi){
100555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes    b31, b30, b29, b28, b27, b26, b25, b24,
100655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		b23, b22, b21, b20, b19, b18, b17, b16,
100755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		b15, b14, b13, b12, b11, b10, b09, b08,
100855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes		b07, b06, b05, b04, b03, b02, b01, b00 };
100955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
101055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
101155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
101255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
101355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
101455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v4di){ a, b, c, d };
101555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
101655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
101755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Create vectors with repeated elements */
101855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
101955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set1_pd(double w)
102055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
102155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d){ w, w, w, w };
102255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
102355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
102455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
102555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set1_ps(float w)
102655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
102755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256){ w, w, w, w, w, w, w, w };
102855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
102955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
103055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
103155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set1_epi32(int i)
103255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
103355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
103455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
103555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
103655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
103755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set1_epi16(short w)
103855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
103955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
104055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
104155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
104255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
104355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set1_epi8(char b)
104455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
104555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
104655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
104755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
104855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
104955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
105055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_set1_epi64x(long long q)
105155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
105255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)(__v4di){ q, q, q, q };
105355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
105455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
105555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Create zeroed vectors */
105655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
105755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_setzero_pd(void)
105855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
105955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d){ 0, 0, 0, 0 };
106055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
106155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
106255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
106355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_setzero_ps(void)
106455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
106555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
106655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
106755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
106855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
106955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_setzero_si256(void)
107055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
107155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
107255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
107355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
107455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes/* Cast between vector types */
107555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
107655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castpd_ps(__m256d in)
107755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
107855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)in;
107955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
108055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
108155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
108255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castpd_si256(__m256d in)
108355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
108455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)in;
108555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
108655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
108755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
108855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castps_pd(__m256 in)
108955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
109055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)in;
109155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
109255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
109355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
109455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castps_si256(__m256 in)
109555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
109655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256i)in;
109755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
109855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
109955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
110055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castsi256_ps(__m256i in)
110155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
110255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256)in;
110355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
110455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
110555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
110655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castsi256_pd(__m256i in)
110755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
110855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes  return (__m256d)in;
110955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
111055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
111155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128d __attribute__((__always_inline__, __nodebug__))
111255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castpd256_pd128(__m256d in)
111355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
11147fc3702694996d7d373e3280812a4172cf451aacBruno Cardoso Lopes  return __builtin_shufflevector(in, in, 0, 1);
111555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
111655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
111755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128 __attribute__((__always_inline__, __nodebug__))
111855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castps256_ps128(__m256 in)
111955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
11207fc3702694996d7d373e3280812a4172cf451aacBruno Cardoso Lopes  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
112155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
112255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
112355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m128i __attribute__((__always_inline__, __nodebug__))
112455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castsi256_si128(__m256i in)
112555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
11267fc3702694996d7d373e3280812a4172cf451aacBruno Cardoso Lopes  return __builtin_shufflevector(in, in, 0, 1);
112755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
112855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
112955db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
113055db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castpd128_pd256(__m128d in)
113155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
11327fc3702694996d7d373e3280812a4172cf451aacBruno Cardoso Lopes  __m128d zero = _mm_setzero_pd();
11337fc3702694996d7d373e3280812a4172cf451aacBruno Cardoso Lopes  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
113455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
113555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
113655db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
113755db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castps128_ps256(__m128 in)
113855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
11397fc3702694996d7d373e3280812a4172cf451aacBruno Cardoso Lopes  __m128 zero = _mm_setzero_ps();
11407fc3702694996d7d373e3280812a4172cf451aacBruno Cardoso Lopes  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
114155db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
114255db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes
114355db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopesstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
114455db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes_mm256_castsi128_si256(__m128i in)
114555db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes{
11467fc3702694996d7d373e3280812a4172cf451aacBruno Cardoso Lopes  __m128i zero = _mm_setzero_si128();
11477fc3702694996d7d373e3280812a4172cf451aacBruno Cardoso Lopes  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
114855db5b874416cde3f2601a717e25d0974bf02f80Bruno Cardoso Lopes}
1149db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1150db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier/* SIMD load ops (unaligned) */
1151db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosierstatic __inline __m256 __attribute__((__always_inline__, __nodebug__))
1152db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier_mm256_loadu2_m128(float const *addr_hi, float const *addr_lo)
1153db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier{
1154db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  struct __loadu_ps {
1155db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier    __m128 v;
1156db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  } __attribute__((__packed__, __may_alias__));
1157db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1158db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __m256 v256 = _mm256_castps128_ps256(((struct __loadu_ps*)addr_lo)->v);
1159db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  return _mm256_insertf128_ps(v256, ((struct __loadu_ps*)addr_hi)->v, 1);
1160db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier}
1161db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1162db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosierstatic __inline __m256d __attribute__((__always_inline__, __nodebug__))
1163db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier_mm256_loadu2_m128d(double const *addr_hi, double const *addr_lo)
1164db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier{
1165db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  struct __loadu_pd {
1166db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier    __m128d v;
1167db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  } __attribute__((__packed__, __may_alias__));
1168db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1169db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __m256d v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)addr_lo)->v);
1170db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  return _mm256_insertf128_pd(v256, ((struct __loadu_pd*)addr_hi)->v, 1);
1171db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier}
1172db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1173db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosierstatic __inline __m256i __attribute__((__always_inline__, __nodebug__))
1174db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier_mm256_loadu2_m128i(__m128i const *addr_hi, __m128i const *addr_lo)
1175db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier{
1176db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  struct __loadu_si128 {
1177db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier    __m128i v;
1178db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  } __attribute__((packed, may_alias));
1179db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __m256i v256 = _mm256_castsi128_si256(((struct __loadu_si128*)addr_lo)->v);
1180db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  return _mm256_insertf128_si256(v256, ((struct __loadu_si128*)addr_hi)->v, 1);
1181db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier}
1182db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1183db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier/* SIMD store ops (unaligned) */
1184db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosierstatic __inline void __attribute__((__always_inline__, __nodebug__))
118541a7e89183116ea89c71d78564a4a7fd7712c0f6Chad Rosier_mm256_storeu2_m128(float *addr_hi, float *addr_lo, __m256 a)
1186db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier{
1187db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __m128 v128;
1188db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1189db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  v128 = _mm256_castps256_ps128(a);
1190db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __builtin_ia32_storeups(addr_lo, v128);
1191db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  v128 = _mm256_extractf128_ps(a, 1);
1192db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __builtin_ia32_storeups(addr_hi, v128);
1193db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier}
1194db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1195db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosierstatic __inline void __attribute__((__always_inline__, __nodebug__))
119641a7e89183116ea89c71d78564a4a7fd7712c0f6Chad Rosier_mm256_storeu2_m128d(double *addr_hi, double *addr_lo, __m256d a)
1197db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier{
1198db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __m128d v128;
1199db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1200db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  v128 = _mm256_castpd256_pd128(a);
1201db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __builtin_ia32_storeupd(addr_lo, v128);
1202db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  v128 = _mm256_extractf128_pd(a, 1);
1203db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __builtin_ia32_storeupd(addr_hi, v128);
1204db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier}
1205db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1206db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosierstatic __inline void __attribute__((__always_inline__, __nodebug__))
120741a7e89183116ea89c71d78564a4a7fd7712c0f6Chad Rosier_mm256_storeu2_m128i(__m128i *addr_hi, __m128i *addr_lo, __m256i a)
1208db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier{
1209db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __m128i v128;
1210db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier
1211db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  v128 = _mm256_castsi256_si128(a);
1212db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __builtin_ia32_storedqu((char *)addr_lo, (__v16qi)v128);
1213db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  v128 = _mm256_extractf128_si256(a, 1);
1214db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier  __builtin_ia32_storedqu((char *)addr_hi, (__v16qi)v128);
1215db163c87f990653b59fcc5f6e4864b652f4a49bdChad Rosier}
1216