12b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// This file is part of Eigen, a lightweight C++ template library
22b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// for linear algebra.
32b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang//
42b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
52b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang//
62b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// This Source Code Form is subject to the terms of the Mozilla
72b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Public License v. 2.0. If a copy of the MPL was not distributed
82b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
92b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_PACKET_MATH_AVX_H
112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define EIGEN_PACKET_MATH_AVX_H
122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace Eigen {
142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace internal {
162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Fallback value (8) for EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD. It only takes
// effect when nothing (user code or an earlier header) has defined the macro.
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Fallback for EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, derived from the pointer
// size: twice sizeof(void*), i.e. 16 when pointers are 8 bytes and 8 when they
// are 4 bytes. Only used when the macro has not been defined earlier.
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// When the compiler advertises FMA support (__FMA__), flag that a fused
// multiply-add is available as a single instruction. The inner guard avoids
// redefining the macro if another header already set it.
#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif
302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Eigen-side names for the 256-bit AVX register types:
//   Packet8f - 8 single-precision floats
//   Packet8i - 256 bits of integer data (used here as 8 x 32-bit ints)
//   Packet4d - 4 double-precision floats
typedef __m256  Packet8f;
typedef __m256i Packet8i;
typedef __m256d Packet4d;
342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Mark the three raw AVX register types as arithmetic for the is_arithmetic
// trait (value = true for each of them).
template<> struct is_arithmetic<__m256>  { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };
382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Helper macros declaring a const packet whose lanes all hold the same value.
// The variable name is built by token-pasting NAME onto a type prefix.

// Declares `const Packet8f p8f_<NAME>` with every lane set to the float X.
#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
  const Packet8f p8f_##NAME = pset1<Packet8f>(X)

// Declares `const Packet4d p4d_<NAME>` with every lane set to the double X.
#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
  const Packet4d p4d_##NAME = pset1<Packet4d>(X)

// Declares `const Packet8f p8f_<NAME>` from an integer bit pattern: X is
// broadcast as a 32-bit int and the register is then reinterpreted as floats
// (_mm256_castsi256_ps is a bit-level cast, not a numeric conversion).
#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
  const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))

// Declares `const Packet8i p8i_<NAME>` with every lane set to the int X.
#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
  const Packet8i p8i_##NAME = pset1<Packet8i>(X)
502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// to leverage AVX512 instructions.
532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_VECTORIZE_AVX512
// Vectorization traits for float when AVX is the widest instruction set in use:
// the full packet is Packet8f (8 lanes) and the half packet is Packet4f
// (declared outside this file). The Has* flags advertise which packet
// operations are available; HasSin and HasTanh follow EIGEN_FAST_MATH, and
// HasCos is explicitly disabled.
template<> struct packet_traits<float>  : default_packet_traits
{
  typedef Packet8f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8,
    HasHalfPacket = 1,

    HasDiv  = 1,
    HasSin  = EIGEN_FAST_MATH,
    HasCos  = 0,
    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh  = EIGEN_FAST_MATH,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
  };
};
// Vectorization traits for double when AVX is the widest instruction set in
// use: the full packet is Packet4d (4 lanes) and the half packet is Packet2d
// (declared outside this file). Unlike the float traits, no HasSin/HasCos/
// HasLog/HasTanh entries are set here, so those keep whatever
// default_packet_traits provides.
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet4d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
    HasHalfPacket = 1,

    HasDiv  = 1,
    HasExp  = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
  };
};
982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Cost-model constants for division when the second template argument is true
// (presumably "vectorized" - confirm against the primary template): 14 for
// float and 16 for double.
template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
1022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang/* Proper support for integers is only provided by AVX2. In the meantime, we'll
1042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang   use SSE instructions and packets to deal with integers.
1052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct packet_traits<int>    : default_packet_traits
1062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
1072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet8i type;
1082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum {
1092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    Vectorizable = 1,
1102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    AlignedOnScalar = 1,
1112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    size=8
1122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  };
1132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
1142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang*/
1152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Reverse mapping from a packet type to its scalar type, half-width packet,
// lane count and required alignment (32 bytes for every 256-bit packet).
template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
1192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
1212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
1222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }
1232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
1252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
1262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
1282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
1292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
1312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
1322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
1342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
1352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
1372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
1382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
1392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
1412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
1422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
1432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
1462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
1472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
1482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
1502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
1512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
1542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
1552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
1562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ eigen_assert(false && "packet integer division are not supported by AVX");
1572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pset1<Packet8i>(0);
1582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// pmadd: fused multiply-add, returns a*b + c per lane. Only specialized when
// the compiler targets FMA (__FMA__).
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
  // On strict GCC and on clang older than 3.8 the intrinsic is bypassed:
  // clang emits vfmadd213ps plus extra vmovaps register moves, and gcc emits
  // vfmadd132ps. Inline asm forces vfmadd231ps, which accumulates into the
  // addend register - the most common use case is accumulating the result of
  // the product.
  // `res` starts as c and is a read-write ("+x") vector-register operand;
  // a and b are read-only vector-register inputs.
  Packet8f res = c;
  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_ps(a,b,c);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
  // Same workaround as the float overload, using the double-precision
  // instruction vfmadd231pd.
  Packet4d res = c;
  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_pd(a,b,c);
#endif
}
#endif
1852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
1872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
1882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
1902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
1912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
1932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
1942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
1962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
1972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
1992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
2002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
2022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
2032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
2052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
2062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
2082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
2092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// pandnot: forwards (a, b) unchanged to the AVX and-not intrinsic, which
// computes (~a) & b, i.e. it complements its FIRST operand.
// NOTE(review): if the generic pandnot contract is "a & ~b", the operands
// here are swapped relative to it - confirm against the primary template and
// its callers before changing anything.
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
2122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
2142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
2152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
2162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
2182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
2192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
2202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}.
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
  // TODO try to find a way to avoid the need of a temporary register
  // Earlier variant, kept for reference:
//   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
//   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
//   return _mm256_unpacklo_ps(tmp,tmp);

  // _mm256_insertf128_ps is very slow on Haswell, thus:
  // Broadcast the 4 floats to both 128-bit halves: {a0,a1,a2,a3 | a0,a1,a2,a3}.
  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // Mimic an "in-place" permutation of the lower 128 bits using a blend: the
  // low half becomes {a0,a1,a0,a1} (blend mask 15 takes the four low lanes
  // from the permuted value), the high half keeps {a0,a1,a2,a3}.
  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
  // Then the same per-half permutation (elements 2,2,3,3) puts everything in
  // shape: low half -> {a0,a0,a1,a1}, high half -> {a2,a2,a3,a3}.
  return  _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}.
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
  // Broadcast the pair to both 128-bit halves: {a0,a1 | a0,a1}.
  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  // Selector 3<<2 == 0b1100: both low-half lanes pick element 0 (a0) and both
  // high-half lanes pick element 1 (a1).
  return  _mm256_permute_pd(tmp, 3<<2);
}
2422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Loads 2 floats from memory a returns the packet {a0, a0  a0, a0, a1, a1, a1, a1}
2442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
2452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
2462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
2472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
2482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
2512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
2522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
2532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
2552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
2562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
2572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
2592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
2602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
2612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
2622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
2632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
2642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
2662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
2672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
2682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
2712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
2722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128 low = _mm256_extractf128_ps(from, 0);
2732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*0] = _mm_cvtss_f32(low);
2742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
2752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
2762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
2772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128 high = _mm256_extractf128_ps(from, 1);
2792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*4] = _mm_cvtss_f32(high);
2802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
2812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
2822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
2832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
2852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
2862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128d low = _mm256_extractf128_pd(from, 0);
2872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*0] = _mm_cvtsd_f64(low);
2882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
2892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128d high = _mm256_extractf128_pd(from, 1);
2902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*2] = _mm_cvtsd_f64(high);
2912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
2922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
2952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
2962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f pa = pset1<Packet8f>(a);
2972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(to, pa);
2982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
3002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
3012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d pa = pset1<Packet4d>(a);
3022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(to, pa);
3032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
3052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
3062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8i pa = pset1<Packet8i>(a);
3072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(to, pa);
3082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_VECTORIZE_AVX512
3112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
3122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
3132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
3142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
3152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {
3172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
3182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
3202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
3212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE int    pfirst<Packet8i>(const Packet8i& a) {
3232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
3242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
3282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
3292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
3302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_permute2f128_ps(tmp, tmp, 1);
3312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
3332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
3342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang   __m256d tmp = _mm256_shuffle_pd(a,a,5);
3352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_permute2f128_pd(tmp, tmp, 1);
3362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
3382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    return _mm256_permute_pd(swap_halves,5);
3392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// pabs should be ok
3422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
3432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
3442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
3452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_and_ps(a,mask);
3462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
3482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
3492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
3502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_and_pd(a,mask);
3512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// preduxp should be ok
3542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// FIXME: why is this ok? why isn't the simply implementation working as expected?
3552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
3562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
3572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
3582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
3592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
3602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
3612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
3632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
3642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
3652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
3662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 perm1 =  _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
3682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 perm2 =  _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
3692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 perm3 =  _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
3702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 perm4 =  _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
3712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 sum1 = _mm256_add_ps(perm1, hsum5);
3732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 sum2 = _mm256_add_ps(perm2, hsum6);
3742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 sum3 = _mm256_add_ps(perm3, hsum7);
3752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 sum4 = _mm256_add_ps(perm4, hsum8);
3762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
3782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
3792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
3812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    return final;
3822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
3842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
3852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4d tmp0, tmp1;
3862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
3882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
3892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
3912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
3922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_blend_pd(tmp0, tmp1, 0xC);
3942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
3972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
3982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
3992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
4012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
4022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
4032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
4062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
4072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
4082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
4112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
4122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f tmp;
4132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
4142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
4152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
4162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
4182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
4192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d tmp;
4202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
4212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
4222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
4252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
4262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
4272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
4282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
4292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
4312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
4322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
4332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
4342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
4372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
4382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
4392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
4402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
4412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
4442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
4452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
4462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
4472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<int Offset>
4512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct palign_impl<Offset,Packet8f>
4522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
4532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
4542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  {
4552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    if (Offset==1)
4562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    {
4572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(first, second, 1);
4582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
4592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
4602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(tmp1, tmp2, 0x88);
4612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
4622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    else if (Offset==2)
4632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    {
4642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(first, second, 3);
4652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
4662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
4672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
4682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
4692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    else if (Offset==3)
4702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    {
4712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(first, second, 7);
4722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
4732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
4742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(tmp1, tmp2, 0xee);
4752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
4762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    else if (Offset==4)
4772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    {
4782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(first, second, 15);
4792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
4802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
4812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
4822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
4832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    else if (Offset==5)
4842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    {
4852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(first, second, 31);
4862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_permute2f128_ps(first, first, 1);
4872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
4882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_permute2f128_ps(tmp, tmp, 1);
4892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(tmp, first, 0x88);
4902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
4912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    else if (Offset==6)
4922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    {
4932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(first, second, 63);
4942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_permute2f128_ps(first, first, 1);
4952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
4962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_permute2f128_ps(tmp, tmp, 1);
4972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(tmp, first, 0xcc);
4982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
4992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    else if (Offset==7)
5002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    {
5012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(first, second, 127);
5022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_permute2f128_ps(first, first, 1);
5032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
5042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_permute2f128_ps(tmp, tmp, 1);
5052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_ps(tmp, first, 0xee);
5062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
5072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  }
5082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
5092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<int Offset>
5112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct palign_impl<Offset,Packet4d>
5122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
5132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
5142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  {
5152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    if (Offset==1)
5162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    {
5172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_pd(first, second, 1);
5182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      __m256d tmp = _mm256_permute_pd(first, 5);
5192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_permute2f128_pd(tmp, tmp, 1);
5202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_pd(tmp, first, 0xA);
5212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
5222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    else if (Offset==2)
5232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    {
5242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_pd(first, second, 3);
5252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_permute2f128_pd(first, first, 1);
5262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
5272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    else if (Offset==3)
5282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    {
5292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_pd(first, second, 7);
5302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      __m256d tmp = _mm256_permute_pd(first, 5);
5312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_permute2f128_pd(tmp, tmp, 1);
5322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm256_blend_pd(tmp, first, 5);
5332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
5342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  }
5352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
5362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void
5382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet8f,8>& kernel) {
5392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
5402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
5412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
5422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
5432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
5442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
5452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
5462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
5472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
5482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
5492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
5502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
5512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
5522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
5532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
5542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
5552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
5562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
5572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
5582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
5592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
5602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
5612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
5622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
5632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void
5662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet8f,4>& kernel) {
5672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
5682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
5692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
5702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
5712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
5732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
5742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
5752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
5762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
5782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
5792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
5802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
5812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void
5842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet4d,4>& kernel) {
5852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
5862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
5872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
5882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
5892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
5912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
5922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
5932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
5942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
5972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  const __m256 zero = _mm256_setzero_ps();
5982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
5992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
6002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
6012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
6032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  const __m256d zero = _mm256_setzero_pd();
6042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
6052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
6062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
6072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)
6102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
6112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_blend_ps(a,pset1<Packet8f>(b),1);
6122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)
6152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
6162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_blend_pd(a,pset1<Packet4d>(b),1);
6172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)
6202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
6212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7));
6222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)
6252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
6262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3));
6272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} // end namespace internal
6302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} // end namespace Eigen
6322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif // EIGEN_PACKET_MATH_AVX_H
634