12b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// This file is part of Eigen, a lightweight C++ template library 22b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// for linear algebra. 32b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// 42b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) 52b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// 62b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// This Source Code Form is subject to the terms of the Mozilla 72b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Public License v. 2.0. If a copy of the MPL was not distributed 82b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 92b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_PACKET_MATH_AVX_H 112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define EIGEN_PACKET_MATH_AVX_H 122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace Eigen { 142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace internal { 162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) 232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef __FMA__ 262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef __m256 Packet8f; 322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef __m256i Packet8i; 332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef __m256d Packet4d; 342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<__m256> { enum { value = true }; }; 362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<__m256i> { enum { value = true }; }; 372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<__m256d> { enum { value = true }; }; 382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ 402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang const Packet8f p8f_##NAME = pset1<Packet8f>(X) 412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \ 432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang const Packet4d p4d_##NAME = pset1<Packet4d>(X) 442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \ 462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X)) 472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \ 492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang const Packet8i p8i_##NAME = pset1<Packet8i>(X) 502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going 522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// to leverage AVX512 instructions. 532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_VECTORIZE_AVX512 542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct packet_traits<float> : default_packet_traits 552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet8f type; 572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet4f half; 582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang enum { 592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Vectorizable = 1, 602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang AlignedOnScalar = 1, 612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang size=8, 622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasHalfPacket = 1, 632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasDiv = 1, 652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSin = EIGEN_FAST_MATH, 662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasCos = 0, 672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasLog = 1, 682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasExp = 1, 692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSqrt = 1, 702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasRsqrt = 1, 712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasTanh = EIGEN_FAST_MATH, 722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasBlend = 1, 732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasRound = 1, 742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasFloor = 1, 752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasCeil = 1 762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang }; 772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}; 782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct packet_traits<double> : default_packet_traits 792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet4d type; 812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet2d half; 822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang enum { 832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Vectorizable = 1, 842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang AlignedOnScalar = 1, 852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang size=4, 862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasHalfPacket = 1, 872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasDiv = 1, 892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasExp = 1, 902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSqrt = 1, 912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasRsqrt = 1, 922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasBlend = 1, 932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasRound = 1, 942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasFloor = 1, 952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasCeil = 1 962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang }; 972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}; 982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct scalar_div_cost<float,true> { enum { value = 14 }; }; 1012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct scalar_div_cost<double,true> { enum { value = 16 }; }; 1022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang/* Proper support for integers is only provided by AVX2. In the meantime, we'll 1042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang use SSE instructions and packets to deal with integers. 1052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct packet_traits<int> : default_packet_traits 1062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 1072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet8i type; 1082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang enum { 1092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Vectorizable = 1, 1102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang AlignedOnScalar = 1, 1112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang size=8 1122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang }; 1132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}; 1142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang*/ 1152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; }; 1172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; }; 1182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; 1192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); } 1212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); } 1222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); } 1232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); } 1252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); } 1262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } 1282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } 1292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } 1312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } 1322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } 1342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } 1352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) 1372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 1382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_sub_ps(_mm256_set1_ps(0.0),a); 1392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) 1412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 1422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_sub_pd(_mm256_set1_pd(0.0),a); 1432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } 1462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; } 1472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } 1482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } 1502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); } 1512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } 1542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } 1552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/) 1562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ eigen_assert(false && "packet integer division are not supported by AVX"); 1572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return pset1<Packet8i>(0); 1582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef __FMA__ 1612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { 1622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) 1632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, 1642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // and gcc stupidly generates a vfmadd132ps instruction, 1652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate 1662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // the result of the product. 1672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f res = c; 1682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); 1692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return res; 1702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 1712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_fmadd_ps(a,b,c); 1722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 1732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { 1752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) 1762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // see above 1772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4d res = c; 1782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); 1792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return res; 1802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 1812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_fmadd_pd(a,b,c); 1822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 1832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 1852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } 1872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } 1882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } 1902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } 1912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } 1932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } 1942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); } 1962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); } 1972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); } 1992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); } 2002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } 2022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } 2032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } 2052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } 2062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } 2082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } 2092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } 2112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } 2122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } 2142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } 2152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); } 2162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); } 2182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } 2192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); } 2202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} 2222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) 2232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 2242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // TODO try to find a way to avoid the need of a temporary register 2252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); 2262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); 2272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// return _mm256_unpacklo_ps(tmp,tmp); 2282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // _mm256_insertf128_ps is very slow on Haswell, thus: 2302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); 2312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // mimic an "inplace" permutation of the lower 128bits using a blend 2322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15); 2332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // then we can perform a consistent permutation on the global register to get everything in shape: 2342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); 2352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1} 2372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) 2382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 2392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from); 2402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_permute_pd(tmp, 3<<2); 2412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1} 2442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) 2452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 2462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from)); 2472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1); 2482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); } 2512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); } 2522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } 2532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); } 2552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } 2562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } 2572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available 2592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4); 2602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride) 2612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 2622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride], 2632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang from[3*stride], from[2*stride], from[1*stride], from[0*stride]); 2642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride) 2662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 2672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); 2682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) 2712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 2722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128 low = _mm256_extractf128_ps(from, 0); 2732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*0] = _mm_cvtss_f32(low); 2742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)); 2752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)); 2762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)); 2772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128 high = _mm256_extractf128_ps(from, 1); 2792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*4] = _mm_cvtss_f32(high); 2802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)); 2812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)); 2822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)); 2832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride) 2852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 2862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128d low = _mm256_extractf128_pd(from, 0); 2872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*0] = _mm_cvtsd_f64(low); 2882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)); 2892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128d high = _mm256_extractf128_pd(from, 1); 2902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*2] = _mm_cvtsd_f64(high); 2912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)); 2922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a) 2952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 2962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f pa = pset1<Packet8f>(a); 2972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore(to, pa); 2982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a) 3002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 3012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4d pa = pset1<Packet4d>(a); 3022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore(to, pa); 3032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a) 3052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 3062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8i pa = pset1<Packet8i>(a); 3072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore(to, pa); 3082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_VECTORIZE_AVX512 3112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } 3122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } 3132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } 3142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 3152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) { 3172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm_cvtss_f32(_mm256_castps256_ps128(a)); 3182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) { 3202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); 3212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) { 3232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); 3242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) 3282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 3292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 tmp = _mm256_shuffle_ps(a,a,0x1b); 3302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_permute2f128_ps(tmp, tmp, 1); 3312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) 3332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 3342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256d tmp = _mm256_shuffle_pd(a,a,5); 3352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_permute2f128_pd(tmp, tmp, 1); 3362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); 3382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_permute_pd(swap_halves,5); 3392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// pabs should be ok 3422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) 3432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 3442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); 3452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_and_ps(a,mask); 3462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) 3482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 3492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); 3502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_and_pd(a,mask); 3512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// preduxp should be ok 3542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// FIXME: why is this ok? why isn't the simply implementation working as expected? 3552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs) 3562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 3572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); 3582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); 3592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); 3602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); 3612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); 3632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); 3642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); 3652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); 3662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); 3682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); 3692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); 3702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); 3712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 sum1 = _mm256_add_ps(perm1, hsum5); 3732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 sum2 = _mm256_add_ps(perm2, hsum6); 3742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 sum3 = _mm256_add_ps(perm3, hsum7); 3752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 sum4 = _mm256_add_ps(perm4, hsum8); 3762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); 3782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); 3792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); 3812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return final; 3822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs) 3842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 3852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4d tmp0, tmp1; 3862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); 3882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); 3892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); 3912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); 3922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_blend_pd(tmp0, tmp1, 0xC); 3942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) 3972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 3982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)))); 3992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) 4012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 4022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); 4032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a) 4062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 4072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); 4082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) 4112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 4122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp; 4132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1)); 4142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); 4152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); 4162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a) 4182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 4192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4d tmp; 4202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1)); 4212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1))); 4222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) 4252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 4262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1)); 4272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); 4282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); 4292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a) 4312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 4322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1)); 4332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); 4342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) 4372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 4382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1)); 4392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); 4402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); 4412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) 4442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 4452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1)); 4462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); 4472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<int Offset> 4512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct palign_impl<Offset,Packet8f> 4522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 4532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) 4542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 4552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang if (Offset==1) 4562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 4572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(first, second, 1); 4582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); 4592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); 4602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(tmp1, tmp2, 0x88); 4612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 4622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang else if (Offset==2) 4632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 4642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(first, second, 3); 4652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); 4662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); 4672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(tmp1, tmp2, 0xcc); 4682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 4692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang else if (Offset==3) 4702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 4712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(first, second, 7); 4722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); 4732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); 4742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(tmp1, tmp2, 0xee); 4752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 4762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang else if (Offset==4) 4772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 4782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(first, second, 15); 4792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); 4802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); 4812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0)); 4822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 4832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang else if (Offset==5) 4842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 4852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(first, second, 31); 4862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_permute2f128_ps(first, first, 1); 4872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); 4882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_permute2f128_ps(tmp, tmp, 1); 4892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(tmp, first, 0x88); 4902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 4912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang else if (Offset==6) 4922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 4932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(first, second, 63); 4942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_permute2f128_ps(first, first, 1); 4952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); 4962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_permute2f128_ps(tmp, tmp, 1); 4972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(tmp, first, 0xcc); 4982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 4992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang else if (Offset==7) 5002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 5012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(first, second, 127); 5022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_permute2f128_ps(first, first, 1); 5032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); 5042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_permute2f128_ps(tmp, tmp, 1); 5052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_ps(tmp, first, 0xee); 5062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 5072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 5082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}; 5092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<int Offset> 5112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct palign_impl<Offset,Packet4d> 5122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 5132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) 5142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 5152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang if (Offset==1) 5162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 5172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_pd(first, second, 1); 5182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256d tmp = _mm256_permute_pd(first, 5); 5192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_permute2f128_pd(tmp, tmp, 1); 5202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_pd(tmp, first, 0xA); 5212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 5222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang else if (Offset==2) 5232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 5242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_pd(first, second, 3); 5252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_permute2f128_pd(first, first, 1); 5262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 5272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang else if (Offset==3) 5282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang { 5292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_pd(first, second, 7); 5302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256d tmp = _mm256_permute_pd(first, 5); 5312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_permute2f128_pd(tmp, tmp, 1); 5322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang first = _mm256_blend_pd(tmp, first, 5); 5332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 5342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 5352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}; 5362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void 5382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet8f,8>& kernel) { 5392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); 5402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); 5412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); 5422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); 5432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]); 5442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]); 5452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]); 5462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]); 5472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); 5482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); 5492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); 5502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); 5512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0)); 5522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2)); 5532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0)); 5542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2)); 5552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20); 5562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20); 5572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20); 5582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20); 5592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31); 5602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31); 5612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31); 5622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31); 5632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void 5662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet8f,4>& kernel) { 5672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); 5682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); 5692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); 5702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); 5712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); 5732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); 5742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); 5752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); 5762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20); 5782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20); 5792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31); 5802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31); 5812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void 5842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet4d,4>& kernel) { 5852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15); 5862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); 5872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15); 5882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); 5892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32); 5912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49); 5922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32); 5932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); 5942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { 5972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang const __m256 zero = _mm256_setzero_ps(); 5982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); 5992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); 6002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); 6012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 6022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { 6032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang const __m256d zero = _mm256_setzero_pd(); 6042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); 6052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); 6062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); 6072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 6082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b) 6102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 6112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_blend_ps(a,pset1<Packet8f>(b),1); 6122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 6132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b) 6152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 6162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_blend_pd(a,pset1<Packet4d>(b),1); 6172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 6182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b) 6202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 6212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7)); 6222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 6232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) 6252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 6262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3)); 6272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 6282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} // end namespace internal 6302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} // end namespace Eigen 6322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif // EIGEN_PACKET_MATH_AVX_H 634