// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
92b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_PACKET_MATH_AVX512_H
112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define EIGEN_PACKET_MATH_AVX512_H
122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace Eigen {
142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace internal {
162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Architecture defaults. Each macro is only defined here when the user (or an
// earlier header) has not already provided a value.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// Evaluates to 8 on 32-bit targets and 16 on 64-bit targets.
#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

// Advertise a single-instruction multiply-add when the compiler reports FMA.
#if defined(__FMA__) && !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef __m512 Packet16f;
322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef __m512i Packet16i;
332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef __m512d Packet8d;
342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct is_arithmetic<__m512> {
372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum { value = true };
382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct is_arithmetic<__m512i> {
412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum { value = true };
422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct is_arithmetic<__m512d> {
452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum { value = true };
462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct packet_traits<float>  : default_packet_traits
492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet16f type;
512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet8f half;
522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum {
532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    Vectorizable = 1,
542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    AlignedOnScalar = 1,
552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    size = 16,
562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasHalfPacket = 1,
572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if EIGEN_GNUC_AT_LEAST(5, 3)
582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasLog = 1,
602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasExp = 1,
622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSqrt = 1,
632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasRsqrt = 1,
642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasDiv = 1
662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  };
672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang };
682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct packet_traits<double> : default_packet_traits
692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet8d type;
712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet4d half;
722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum {
732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    Vectorizable = 1,
742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    AlignedOnScalar = 1,
752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    size = 8,
762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasHalfPacket = 1,
772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if EIGEN_GNUC_AT_LEAST(5, 3)
782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSqrt = 1,
792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasRsqrt = EIGEN_FAST_MATH,
802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasDiv = 1
822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  };
832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
/* TODO Implement AVX512 for integers
template<> struct packet_traits<int>    : default_packet_traits
{
  typedef Packet16i type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=16
  };
};
*/
962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct unpacket_traits<Packet16f> {
992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef float type;
1002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet8f half;
1012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum { size = 16, alignment=Aligned64 };
1022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
1032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct unpacket_traits<Packet8d> {
1052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef double type;
1062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet4d half;
1072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum { size = 8, alignment=Aligned64 };
1082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
1092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct unpacket_traits<Packet16i> {
1112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef int type;
1122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet8i half;
1132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum { size = 16, alignment=Aligned64 };
1142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
1152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
1182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_set1_ps(from);
1192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pset1<Packet8d>(const double& from) {
1222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_set1_pd(from);
1232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
1262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_set1_epi32(from);
1272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
1312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_broadcastss_ps(_mm_load_ps1(from));
1322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
1352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_broadcastsd_pd(_mm_load_pd1(from));
1362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
1402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_add_ps(
1412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      _mm512_set1_ps(a),
1422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
1432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                    4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
1442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
1472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_add_pd(_mm512_set1_pd(a),
1482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                       _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
1492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
1532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                              const Packet16f& b) {
1542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_add_ps(a, b);
1552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
1582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                            const Packet8d& b) {
1592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_add_pd(a, b);
1602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
1642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                              const Packet16f& b) {
1652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_sub_ps(a, b);
1662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
1692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                            const Packet8d& b) {
1702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_sub_pd(a, b);
1712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
1752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
1762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
1792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
1802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
1842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return a;
1852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) {
1882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return a;
1892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
1922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return a;
1932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
1962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
1972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                              const Packet16f& b) {
1982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_mul_ps(a, b);
1992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
2022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                            const Packet8d& b) {
2032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_mul_pd(a, b);
2042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
2082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                              const Packet16f& b) {
2092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_div_ps(a, b);
2102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
2132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                            const Packet8d& b) {
2142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_div_pd(a, b);
2152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef __FMA__
2182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
2202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                    const Packet16f& c) {
2212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_fmadd_ps(a, b, c);
2222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
2252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                   const Packet8d& c) {
2262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_fmadd_pd(a, b, c);
2272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
2292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
2322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                              const Packet16f& b) {
2332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_min_ps(a, b);
2342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
2372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                            const Packet8d& b) {
2382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_min_pd(a, b);
2392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
2432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                              const Packet16f& b) {
2442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_max_ps(a, b);
2452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
2482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                            const Packet8d& b) {
2492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_max_pd(a, b);
2502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
2542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                              const Packet16f& b) {
2552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
2562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_and_ps(a, b);
2572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
2582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f res = _mm512_undefined_ps();
2592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
2602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
2612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
2622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
2642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
2652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
2662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
2682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
2692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
2702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
2722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
2732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
2742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
2762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
2772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
2802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                            const Packet8d& b) {
2812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
2822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_and_pd(a, b);
2832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
2842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8d res = _mm512_undefined_pd();
2852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
2862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
2872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);
2882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
2902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
2912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
2922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
2942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
2952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
2972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
2982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                             const Packet16f& b) {
2992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
3002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_or_ps(a, b);
3012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
3022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f res = _mm512_undefined_ps();
3032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
3042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
3052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
3062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
3082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
3092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
3102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
3122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
3132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
3142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
3162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
3172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
3182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
3202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
3212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
3242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
3252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                           const Packet8d& b) {
3262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
3272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_or_pd(a, b);
3282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
3292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8d res = _mm512_undefined_pd();
3302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
3312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
3322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
3332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
3352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
3362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
3372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
3392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
3402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
3432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
3442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                              const Packet16f& b) {
3452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
3462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_xor_ps(a, b);
3472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
3482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f res = _mm512_undefined_ps();
3492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
3502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
3512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
3522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
3542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
3552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
3562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
3582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
3592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
3602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
3622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
3632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
3642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
3662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
3672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
3692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
3702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                            const Packet8d& b) {
3712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
3722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_xor_pd(a, b);
3732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
3742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8d res = _mm512_undefined_pd();
3752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
3762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
3772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
3782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
3802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
3812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
3822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
3842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
3852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
3882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
3892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                                 const Packet16f& b) {
3902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
3912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_andnot_ps(a, b);
3922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
3932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f res = _mm512_undefined_ps();
3942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
3952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
3962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
3972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
3992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
4002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
4012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
4032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
4042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
4052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
4072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
4082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
4092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
4112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
4122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
4142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
4152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                               const Packet8d& b) {
4162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
4172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_andnot_pd(a, b);
4182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
4192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8d res = _mm512_undefined_pd();
4202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
4212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
4222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
4232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
4252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
4262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
4272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
4292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
4302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
4332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
4342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
4352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
4372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
4382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_pd(from);
4392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
4412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
4422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
4432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      reinterpret_cast<const __m512i*>(from));
4442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
4472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
4482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from);
4492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
4512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
4522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_pd(from);
4532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
4552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
4562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
4572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      reinterpret_cast<const __m512i*>(from));
4582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Loads 8 floats from memory a returns the packet
4612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
4622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
4632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
4642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from);
4652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // mimic an "inplace" permutation of the lower 128bits using a blend
4662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  lane0 = _mm256_blend_ps(
4672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      lane0, _mm256_castps128_ps256(_mm_permute_ps(
4682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                 _mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))),
4692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      15);
4702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // then we can perform a consistent permutation on the global register to get
4712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // everything in shape:
4722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));
4732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
4752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // mimic an "inplace" permutation of the lower 128bits using a blend
4762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  lane1 = _mm256_blend_ps(
4772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      lane1, _mm256_castps128_ps256(_mm_permute_ps(
4782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                 _mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
4792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      15);
4802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // then we can perform a consistent permutation on the global register to get
4812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // everything in shape:
4822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));
4832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
4852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f res = _mm512_undefined_ps();
4862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_insertf32x8(res, lane0, 0);
4872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_insertf32x8(res, lane1, 1);
4882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
4892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
4902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f res = _mm512_undefined_ps();
4912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
4922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
4932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
4942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
4952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
4962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
4972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Loads 4 doubles from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3,
4992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// a3}
5002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
5022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from);
5032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  lane0 = _mm256_permute_pd(lane0, 3 << 2);
5042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2));
5062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  lane1 = _mm256_permute_pd(lane1, 3 << 2);
5072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8d res = _mm512_undefined_pd();
5092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm512_insertf64x4(res, lane0, 0);
5102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_insertf64x4(res, lane1, 1);
5112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Loads 4 floats from memory a returns the packet
5142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// {a0, a0  a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
5152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
5172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f tmp = _mm512_undefined_ps();
5182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
5192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
5202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
5212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
5222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return tmp;
5232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Loads 2 doubles from memory a returns the packet
5252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// {a0, a0  a0, a0, a1, a1, a1, a1}
5262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
5282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8d tmp = _mm512_undefined_pd();
5292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet2d tmp0 = _mm_load_pd1(from);
5302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet2d tmp1 = _mm_load_pd1(from + 1);
5312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
5322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
5332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp = _mm512_insertf64x4(tmp, lane0, 0);
5342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_insertf64x4(tmp, lane1, 1);
5352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
5392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
5402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
5432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from);
5442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
5472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
5482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                                from);
5492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
5532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
5542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
5572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from);
5582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
5612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
5622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      reinterpret_cast<__m512i*>(to), from);
5632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
5672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                                             Index stride) {
5682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16i stride_vector = _mm512_set1_epi32(stride);
5692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16i stride_multiplier =
5702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
5712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
5722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_i32gather_ps(indices, from, 4);
5742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
5772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                                            Index stride) {
5782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8i stride_vector = _mm256_set1_epi32(stride);
5792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
5802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
5812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_i32gather_pd(indices, from, 8);
5832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
5872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                                         const Packet16f& from,
5882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                                         Index stride) {
5892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16i stride_vector = _mm512_set1_epi32(stride);
5902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16i stride_multiplier =
5912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
5922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
5932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  _mm512_i32scatter_ps(to, indices, from, 4);
5942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
5962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
5972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                                         const Packet8d& from,
5982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                                         Index stride) {
5992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8i stride_vector = _mm256_set1_epi32(stride);
6002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
6012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
6022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  _mm512_i32scatter_pd(to, indices, from, 8);
6032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
6062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
6072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f pa = pset1<Packet16f>(a);
6082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(to, pa);
6092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
6112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) {
6122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8d pa = pset1<Packet8d>(a);
6132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(to, pa);
6142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
6162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
6172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16i pa = pset1<Packet16i>(a);
6182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(to, pa);
6192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
6222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
6232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
6242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
6262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
6272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
6282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
6302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
6312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
6322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
6342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
6352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
6362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
6392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
6402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
6412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
6442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
6452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
6462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
6492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
6502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // _mm512_abs_ps intrinsic not found, so hack around it
6512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
6522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
6542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
6552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // _mm512_abs_ps intrinsic not found, so hack around it
6562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return (__m512d)_mm512_and_si512((__m512i)a,
6572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                   _mm512_set1_epi64(0x7fffffffffffffff));
6582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) splits the 512-bit float vector
// INPUT into two 256-bit halves and declares them as OUTPUT##_0 (floats 0-7)
// and OUTPUT##_1 (floats 8-15).
#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)        \
  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
#else
// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512,
// so each half is stitched together from two 128-bit extracts.
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
  __m256 OUTPUT##_0 = _mm256_insertf128_ps(                     \
      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
      _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
  __m256 OUTPUT##_1 = _mm256_insertf128_ps(                     \
      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
      _mm512_extractf32x4_ps(INPUT, 3), 1);
#endif
6742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) overwrites the 512-bit float
// vector OUTPUT so that INPUTA becomes floats 0-7 and INPUTB floats 8-15.
// It expands to several statements, so it must not be used as the unbraced
// body of an if/for/while.
#ifndef EIGEN_VECTORIZE_AVX512DQ
// _mm512_insertf32x8 needs AVX512DQ; without it each 256-bit input is moved
// as two 128-bit pieces.
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
#else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0);        \
  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
#endif
6862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f*
6872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangvecs)
6882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
6892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
6902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
6912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2);
6922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3);
6932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4);
6942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5);
6952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6);
6962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7);
6972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8);
6982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9);
6992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10);
7002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11);
7012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12);
7022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13);
7032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14);
7042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15);
7052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0);
7072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0);
7082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0);
7092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0);
7102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
7122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
7132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
7142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
7152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
7172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
7182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
7192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
7202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
7222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
7232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
7242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 sum4 = _mm256_add_ps(perm4, hsum8);
7252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
7272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
7282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
7302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1);
7322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1);
7332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1);
7342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1);
7352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
7372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
7382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
7392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
7402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
7422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
7432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
7442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
7452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum1 = _mm256_add_ps(perm1, hsum5);
7472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum2 = _mm256_add_ps(perm2, hsum6);
7482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum3 = _mm256_add_ps(perm3, hsum7);
7492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum4 = _mm256_add_ps(perm4, hsum8);
7502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
7522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
7532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0));
7552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0);
7572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0);
7582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0);
7592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0);
7602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
7622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
7632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
7642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
7652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
7672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
7682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
7692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
7702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum1 = _mm256_add_ps(perm1, hsum5);
7722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum2 = _mm256_add_ps(perm2, hsum6);
7732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum3 = _mm256_add_ps(perm3, hsum7);
7742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum4 = _mm256_add_ps(perm4, hsum8);
7752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
7772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
7782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0);
7802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1);
7822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1);
7832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1);
7842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1);
7852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
7872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
7882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
7892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
7902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
7922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
7932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
7942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
7952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum1 = _mm256_add_ps(perm1, hsum5);
7972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum2 = _mm256_add_ps(perm2, hsum6);
7982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum3 = _mm256_add_ps(perm3, hsum7);
7992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum4 = _mm256_add_ps(perm4, hsum8);
8002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
8022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
8032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0));
8052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 final_output;
8072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1);
8092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return final_output;
8102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
8132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
8142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0);
8152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1);
8162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0);
8182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1);
8192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0);
8212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1);
8222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0);
8242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1);
8252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0);
8272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1);
8282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0);
8302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1);
8312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0);
8332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1);
8342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0);
8362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1);
8372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d tmp0, tmp1;
8392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0);
8412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
8422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0);
8442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
8452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC);
8472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1);
8492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
8502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1);
8522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
8532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC));
8552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0);
8572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
8582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0);
8602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
8612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC);
8632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1);
8652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
8662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1);
8682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
8692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
8712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0);
8732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_insertf64x4(final_output, final_1, 1);
8752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
8782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
8792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  //#ifdef EIGEN_VECTORIZE_AVX512DQ
8802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if 0
8812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
8822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
8832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f sum = padd(lane0, lane1);
8842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1));
8852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp0 = _mm256_hadd_ps(tmp0, tmp0);
8862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
8872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
8882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
8892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
8902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
8912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
8922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3));
8932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum = _mm_hadd_ps(sum, sum);
8942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
8952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(sum);
8962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
8972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
8992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
9002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
9012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
9022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d sum = padd(lane0, lane1);
9032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
9042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm256_hadd_pd(tmp0, tmp0));
9052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
9082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
9092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_VECTORIZE_AVX512DQ
9102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
9112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
9122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return padd(lane0, lane1);
9132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
9142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
9152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
9162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
9172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
9182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f sum0 = padd(lane0, lane2);
9192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f sum1 = padd(lane1, lane3);
9202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
9212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
9222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
9242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
9252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
9262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
9272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d res = padd(lane0, lane1);
9282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return res;
9292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
9322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
9332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang//#ifdef EIGEN_VECTORIZE_AVX512DQ
9342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if 0
9352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
9362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
9372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f res = pmul(lane0, lane1);
9382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
9392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
9402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
9412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
9422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
9432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
9442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
9452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
9462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
9472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
9482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
9492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
9502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
9522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
9532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
9542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
9552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d res = pmul(lane0, lane1);
9562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
9572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
9582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
9612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
9622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
9632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
9642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
9652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
9662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
9672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
9682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
9692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
9712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
9722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
9732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
9742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d res = _mm256_min_pd(lane0, lane1);
9752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
9762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
9772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
9802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
9812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
9822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
9832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
9842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
9852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
9862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
9872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
9882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
9902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
9912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
9922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
9932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4d res = _mm256_max_pd(lane0, lane1);
9942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
9952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
9962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <int Offset>
9992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct palign_impl<Offset, Packet16f> {
10002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  static EIGEN_STRONG_INLINE void run(Packet16f& first,
10012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                      const Packet16f& second) {
10022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    if (Offset != 0) {
10032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      __m512i first_idx = _mm512_set_epi32(
10042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
10052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
10062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
10072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      __m512i second_idx =
10092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang          _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
10102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                           Offset - 5, Offset - 6, Offset - 7, Offset - 8,
10112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                           Offset - 9, Offset - 10, Offset - 11, Offset - 12,
10122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                           Offset - 13, Offset - 14, Offset - 15, Offset - 16);
10132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      unsigned short mask = 0xFFFF;
10152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      mask <<= (16 - Offset);
10162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm512_permutexvar_ps(first_idx, first);
10182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
10192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm512_mask_blend_ps(mask, first, tmp);
10202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
10212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  }
10222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
10232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <int Offset>
10242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct palign_impl<Offset, Packet8d> {
10252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
10262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    if (Offset != 0) {
10272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      __m512i first_idx = _mm512_set_epi32(
10282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
10292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
10302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      __m512i second_idx = _mm512_set_epi32(
10322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
10332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
10342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      unsigned char mask = 0xFF;
10362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      mask <<= (8 - Offset);
10372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm512_permutexvar_pd(first_idx, first);
10392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
10402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      first = _mm512_mask_blend_pd(mask, first, tmp);
10412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
10422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  }
10432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
10442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Fills OUTPUT[INDEX] from the two packets INPUT[INDEX] and
// INPUT[INDEX + STRIDE] through EIGEN_INSERT_8f_INTO_16f (going by that
// macro's name: two 8-float packets combined into one 16-float packet).
// The expansion already ends with ';', so a call site that appends its own
// semicolon merely adds an empty statement.
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE)       \
  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], \
                           INPUT[INDEX + STRIDE]);
10482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
10502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
10512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
10522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
10532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
10542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
10552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
10562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
10572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
10582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]);
10592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]);
10602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]);
10612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]);
10622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]);
10632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]);
10642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]);
10652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]);
10662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
10672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
10682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
10692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
10702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
10712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
10722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
10732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
10742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
10752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
10762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
10772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
10782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
10792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
10802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
10812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
10822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
10842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
10852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
10862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
10872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S4, S4);
10882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S5, S5);
10892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S6, S6);
10902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S7, S7);
10912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S8, S8);
10922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S9, S9);
10932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S10, S10);
10942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S11, S11);
10952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S12, S12);
10962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S13, S13);
10972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S14, S14);
10982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S15, S15);
10992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PacketBlock<Packet8f, 32> tmp;
11012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
11032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
11042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
11052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
11062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
11072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
11082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
11092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);
11102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
11122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
11132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
11142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
11152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
11162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
11172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
11182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);
11192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // Second set of _m256 outputs
11212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
11222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
11232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
11242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
11252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
11262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
11272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
11282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);
11292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
11312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
11322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
11332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
11342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
11352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
11362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
11372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);
11382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // Pack them into the output
11402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16);
11412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16);
11422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16);
11432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16);
11442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16);
11462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16);
11472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16);
11482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16);
11492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16);
11512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16);
11522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16);
11532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16);
11542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16);
11562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16);
11572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
11582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
11592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
// Builds OUTPUT[INDEX] from two 8-float packets, INPUT[2 * INDEX] and
// INPUT[2 * INDEX + STRIDE], handing them to EIGEN_INSERT_8f_INTO_16f in that
// order.
//
// Every argument is parenthesized so expression arguments (e.g. `i + 1`)
// index the intended elements, and the body is wrapped in do { } while (0) so
// the macro always expands to exactly one statement. Arguments are evaluated
// more than once, so they must be free of side effects.
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)                   \
  do {                                                                \
    EIGEN_INSERT_8f_INTO_16f((OUTPUT)[(INDEX)], (INPUT)[2 * (INDEX)], \
                             (INPUT)[2 * (INDEX) + (STRIDE)]);        \
  } while (0)
11632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
11652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
11662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
11672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
11682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
11692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
11712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
11722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
11732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
11742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
11762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
11772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
11782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
11792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PacketBlock<Packet8f, 8> tmp;
11812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
11832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
11842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
11852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);
11862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
11882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
11892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
11902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);
11912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1);
11932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1);
11942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1);
11952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1);
11962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
11972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Assembles OUTPUT[INDEX] (512-bit, doubles) from two 256-bit inputs:
// INPUT[INDEX] is inserted at position 0 and INPUT[INDEX + STRIDE] at
// position 1 via _mm512_insertf64x4.
//
// The two assignments are wrapped in do { } while (0) so the macro behaves as
// a single statement (e.g. under an unbraced `if`), and all arguments are
// parenthesized. Arguments are evaluated more than once, so they must be free
// of side effects.
#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)                \
  do {                                                                \
    (OUTPUT)[(INDEX)] =                                               \
        _mm512_insertf64x4((OUTPUT)[(INDEX)], (INPUT)[(INDEX)], 0);   \
    (OUTPUT)[(INDEX)] = _mm512_insertf64x4(                           \
        (OUTPUT)[(INDEX)], (INPUT)[(INDEX) + (STRIDE)], 1);           \
  } while (0)
12012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
// Assembles OUTPUT[INDEX] (512-bit, doubles) from two 256-bit inputs:
// INPUT[2 * INDEX] is inserted at position 0 and INPUT[2 * INDEX + STRIDE] at
// position 1 via _mm512_insertf64x4.
//
// INDEX is parenthesized before being doubled so expression arguments such as
// `i + 1` select the intended elements, and the two assignments are wrapped in
// do { } while (0) so the macro behaves as a single statement. Arguments are
// evaluated more than once, so they must be free of side effects.
#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)                     \
  do {                                                                  \
    (OUTPUT)[(INDEX)] =                                                 \
        _mm512_insertf64x4((OUTPUT)[(INDEX)], (INPUT)[2 * (INDEX)], 0); \
    (OUTPUT)[(INDEX)] = _mm512_insertf64x4(                             \
        (OUTPUT)[(INDEX)], (INPUT)[2 * (INDEX) + (STRIDE)], 1);         \
  } while (0)
12062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
12082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
12092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
12102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
12112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);
12122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PacketBlock<Packet4d, 8> tmp;
12142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
12162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
12172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
12182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
12192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
12202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
12212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
12222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T3, 0), 0x31);
12232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
12252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
12262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
12272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
12282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
12292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
12302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
12312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T3, 1), 0x31);
12322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
12342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
12352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1);
12362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1);
12372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
12382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
12402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
12412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
12422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
12432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
12442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
12452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
12462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
12472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);
12482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PacketBlock<Packet4d, 16> tmp;
12502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
12522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
12532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
12542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
12552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
12562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
12572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
12582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T3, 0), 0x31);
12592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
12612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
12622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
12632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
12642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
12652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
12662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
12672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T3, 1), 0x31);
12682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
12702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T6, 0), 0x20);
12712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
12722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                         _mm512_extractf64x4_pd(T7, 0), 0x20);
12732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
12742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                          _mm512_extractf64x4_pd(T6, 0), 0x31);
12752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
12762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                          _mm512_extractf64x4_pd(T7, 0), 0x31);
12772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
12792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                          _mm512_extractf64x4_pd(T6, 1), 0x20);
12802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
12812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                          _mm512_extractf64x4_pd(T7, 1), 0x20);
12822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
12832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                          _mm512_extractf64x4_pd(T6, 1), 0x31);
12842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
12852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                          _mm512_extractf64x4_pd(T7, 1), 0x31);
12862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8);
12882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8);
12892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8);
12902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8);
12912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
12922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8);
12932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8);
12942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8);
12952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8);
12962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
12972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
12982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
12992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                     const Packet16f& /*thenPacket*/,
13002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                     const Packet16f& /*elsePacket*/) {
13012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  assert(false && "To be implemented");
13022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return Packet16f();
13032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
13042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
13052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
13062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                    const Packet8d& /*thenPacket*/,
13072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang                                    const Packet8d& /*elsePacket*/) {
13082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  assert(false && "To be implemented");
13092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return Packet8d();
13102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
13112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
13122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} // end namespace internal
13132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
13142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} // end namespace Eigen
13152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
13162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif // EIGEN_PACKET_MATH_AVX512_H
1317