12b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// This file is part of Eigen, a lightweight C++ template library
22b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// for linear algebra.
32b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang//
42b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
52b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang//
62b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// This Source Code Form is subject to the terms of the Mozilla
72b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Public License v. 2.0. If a copy of the MPL was not distributed
82b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
92b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define EIGEN_PACKET_MATH_HALF_CUDA_H
122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace Eigen {
152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace internal {
162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Most of the following operations require arch >= 3.0
182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<half2> { enum { value = true }; };
212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct packet_traits<Eigen::half> : default_packet_traits
232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef half2 type;
252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef half2 half;
262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum {
272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    Vectorizable = 1,
282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    AlignedOnScalar = 1,
292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    size=2,
302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasHalfPacket = 0,
312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasAdd    = 1,
322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasMul    = 1,
332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasDiv    = 1,
342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSqrt   = 1,
352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasRsqrt  = 1,
362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasExp    = 1,
372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasLog    = 1,
382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasLog1p  = 1
392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  };
402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __half2half2(from);
462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return *reinterpret_cast<const half2*>(from);
502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __halves2half2(from[0], from[1]);
542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __halves2half2(from[0], from[0]);
582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  *reinterpret_cast<half2*>(to) = from;
622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[0] = __low2half(from);
662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[1] = __high2half(from);
672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<>
702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 350
722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang   return __ldg((const half2*)from);
732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __halves2half2(*(from+0), *(from+1));
752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<>
792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 350
812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang   return __halves2half2(__ldg(from+0), __ldg(from+1));
822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __halves2half2(*(from+0), *(from+1));
842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __halves2half2(from[0*stride], from[1*stride]);
892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*0] = __low2half(from);
932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*1] = __high2half(from);
942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __low2half(a);
982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
1012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half2 result;
1022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = a.x & 0x7FFF7FFF;
1032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
1042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang__device__ EIGEN_STRONG_INLINE void
1082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<half2,2>& kernel) {
1092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half a1 = __low2half(kernel.packet[0]);
1102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half a2 = __high2half(kernel.packet[0]);
1112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half b1 = __low2half(kernel.packet[1]);
1122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half b2 = __high2half(kernel.packet[1]);
1132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[0] = __halves2half2(a1, b1);
1142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[1] = __halves2half2(a2, b2);
1152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
1182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530
1192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
1202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
1212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f = __half2float(a) + 1.0f;
1222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __halves2half2(a, __float2half(f));
1232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
1242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
1272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530
1282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __hadd2(a, b);
1292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
1302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
1312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
1322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b1 = __low2float(b);
1332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b2 = __high2float(b);
1342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r1 = a1 + b1;
1352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r2 = a2 + b2;
1362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(r1, r2);
1372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
1382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
1412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530
1422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __hsub2(a, b);
1432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
1442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
1452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
1462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b1 = __low2float(b);
1472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b2 = __high2float(b);
1482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r1 = a1 - b1;
1492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r2 = a2 - b2;
1502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(r1, r2);
1512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
1522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
1552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530
1562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __hneg2(a);
1572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
1582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
1592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
1602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(-a1, -a2);
1612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
1622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
1652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
1672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530
1682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __hmul2(a, b);
1692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
1702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
1712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
1722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b1 = __low2float(b);
1732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b2 = __high2float(b);
1742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r1 = a1 * b1;
1752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r2 = a2 * b2;
1762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(r1, r2);
1772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
1782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
1812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530
1822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang   return __hfma2(a, b, c);
1832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
1842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
1852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
1862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b1 = __low2float(b);
1872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b2 = __high2float(b);
1882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float c1 = __low2float(c);
1892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float c2 = __high2float(c);
1902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r1 = a1 * b1 + c1;
1912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r2 = a2 * b2 + c2;
1922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(r1, r2);
1932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
1942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
1952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
1962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
1972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
1982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
1992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b1 = __low2float(b);
2002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b2 = __high2float(b);
2012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r1 = a1 / b1;
2022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r2 = a2 / b2;
2032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(r1, r2);
2042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
2072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
2082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
2092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b1 = __low2float(b);
2102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b2 = __high2float(b);
2112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
2122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
2132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __halves2half2(r1, r2);
2142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
2172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
2182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
2192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b1 = __low2float(b);
2202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float b2 = __high2float(b);
2212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
2222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
2232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __halves2half2(r1, r2);
2242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
2272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530
2282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __hadd(__low2half(a), __high2half(a));
2292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
2302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
2312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
2322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
2332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
2342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
2372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530
2382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half first = __low2half(a);
2392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half second = __high2half(a);
2402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __hgt(first, second) ? first : second;
2412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
2422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
2432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
2442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return a1 > a2 ? __low2half(a) : __high2half(a);
2452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
2462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
2492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530
2502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half first = __low2half(a);
2512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __half second = __high2half(a);
2522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __hlt(first, second) ? first : second;
2532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
2542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
2552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
2562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return a1 < a2 ? __low2half(a) : __high2half(a);
2572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
2582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
2612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530
2622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __hmul(__low2half(a), __high2half(a));
2632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
2642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
2652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
2662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
2672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
2682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
2712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
2722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
2732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r1 = log1pf(a1);
2742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r2 = log1pf(a2);
2752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(r1, r2);
2762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
2792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<>  __device__ EIGEN_STRONG_INLINE
2812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wanghalf2 plog<half2>(const half2& a) {
2822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return h2log(a);
2832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE
2862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wanghalf2 pexp<half2>(const half2& a) {
2872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return h2exp(a);
2882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE
2912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wanghalf2 psqrt<half2>(const half2& a) {
2922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return h2sqrt(a);
2932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
2952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE
2962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wanghalf2 prsqrt<half2>(const half2& a) {
2972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return h2rsqrt(a);
2982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
2992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
3012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
3032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
3042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
3052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r1 = logf(a1);
3062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r2 = logf(a2);
3072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(r1, r2);
3082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
3112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
3122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
3132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r1 = expf(a1);
3142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r2 = expf(a2);
3152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(r1, r2);
3162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
3192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
3202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
3212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r1 = sqrtf(a1);
3222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r2 = sqrtf(a2);
3232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(r1, r2);
3242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
3272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a1 = __low2float(a);
3282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float a2 = __high2float(a);
3292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r1 = rsqrtf(a1);
3302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float r2 = rsqrtf(a2);
3312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return __floats2half2_rn(r1, r2);
3322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
3352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#elif defined EIGEN_VECTORIZE_AVX512
3372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef struct {
3392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i x;
3402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} Packet16h;
3412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<Packet16h> { enum { value = true }; };
3442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
3462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct packet_traits<half> : default_packet_traits {
3472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet16h type;
3482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // There is no half-size packet for Packet16h.
3492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet16h half;
3502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum {
3512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    Vectorizable = 1,
3522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    AlignedOnScalar = 1,
3532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    size = 16,
3542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasHalfPacket = 0,
3552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasAdd    = 0,
3562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSub    = 0,
3572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasMul    = 0,
3582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasNegate = 0,
3592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasAbs    = 0,
3602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasAbs2   = 0,
3612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasMin    = 0,
3622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasMax    = 0,
3632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasConj   = 0,
3642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSetLinear = 0,
3652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasDiv = 0,
3662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSqrt = 0,
3672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasRsqrt = 0,
3682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasExp = 0,
3692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasLog = 0,
3702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasBlend = 0
3712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  };
3722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
3732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
3762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
3782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16h result;
3792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm256_set1_epi16(from.x);
3802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
3812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
3842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));
3852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
3882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16h result;
3892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
3902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
3912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
3942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16h result;
3952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
3962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
3972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
3982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
3992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
4002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  _mm256_store_si256((__m256i*)to, from.x);
4012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
4042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  _mm256_storeu_si256((__m256i*)to, from.x);
4052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h
4082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangploadquad(const Eigen::half* from) {
4092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16h result;
4102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  unsigned short a = from[0].x;
4112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  unsigned short b = from[1].x;
4122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  unsigned short c = from[2].x;
4132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  unsigned short d = from[3].x;
4142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
4152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
4162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
4192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_HAS_FP16_C
4202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_cvtph_ps(a.x);
4212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
4222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN64 half aux[16];
4232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(aux, a);
4242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f0(aux[0]);
4252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f1(aux[1]);
4262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f2(aux[2]);
4272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f3(aux[3]);
4282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f4(aux[4]);
4292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f5(aux[5]);
4302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f6(aux[6]);
4312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f7(aux[7]);
4322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f8(aux[8]);
4332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f9(aux[9]);
4342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float fa(aux[10]);
4352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float fb(aux[11]);
4362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float fc(aux[12]);
4372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float fd(aux[13]);
4382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float fe(aux[14]);
4392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float ff(aux[15]);
4402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm512_set_ps(
4422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
4432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
4442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
4472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_HAS_FP16_C
4482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16h result;
4492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
4502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
4512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
4522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN64 float aux[16];
4532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(aux, a);
4542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half h0(aux[0]);
4552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half h1(aux[1]);
4562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half h2(aux[2]);
4572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half h3(aux[3]);
4582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half h4(aux[4]);
4592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half h5(aux[5]);
4602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half h6(aux[6]);
4612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half h7(aux[7]);
4622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half h8(aux[8]);
4632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half h9(aux[9]);
4642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half ha(aux[10]);
4652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half hb(aux[11]);
4662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half hc(aux[12]);
4672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half hd(aux[13]);
4682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half he(aux[14]);
4692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  half hf(aux[15]);
4702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16h result;
4722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm256_set_epi16(
4732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
4742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
4752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
4762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
4772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
4802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f af = half2float(a);
4812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f bf = half2float(b);
4822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f rf = padd(af, bf);
4832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return float2half(rf);
4842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
4872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f af = half2float(a);
4882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f bf = half2float(b);
4892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f rf = pmul(af, bf);
4902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return float2half(rf);
4912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
4942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16f from_float = half2float(from);
4952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return half(predux(from_float));
4962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
4972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
4982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
4992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
5002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet16h result;
5012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm256_set_epi16(
5022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
5032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
5042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
5052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
5062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
5072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
5102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
5112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN64 half aux[16];
5122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(aux, from);
5132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*0].x = aux[0].x;
5142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*1].x = aux[1].x;
5152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*2].x = aux[2].x;
5162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*3].x = aux[3].x;
5172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*4].x = aux[4].x;
5182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*5].x = aux[5].x;
5192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*6].x = aux[6].x;
5202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*7].x = aux[7].x;
5212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*8].x = aux[8].x;
5222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*9].x = aux[9].x;
5232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*10].x = aux[10].x;
5242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*11].x = aux[11].x;
5252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*12].x = aux[12].x;
5262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*13].x = aux[13].x;
5272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*14].x = aux[14].x;
5282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*15].x = aux[15].x;
5292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
5302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void
5322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet16h,16>& kernel) {
5332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a = kernel.packet[0].x;
5342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i b = kernel.packet[1].x;
5352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i c = kernel.packet[2].x;
5362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i d = kernel.packet[3].x;
5372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i e = kernel.packet[4].x;
5382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i f = kernel.packet[5].x;
5392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i g = kernel.packet[6].x;
5402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i h = kernel.packet[7].x;
5412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i i = kernel.packet[8].x;
5422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i j = kernel.packet[9].x;
5432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i k = kernel.packet[10].x;
5442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i l = kernel.packet[11].x;
5452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i m = kernel.packet[12].x;
5462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i n = kernel.packet[13].x;
5472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i o = kernel.packet[14].x;
5482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i p = kernel.packet[15].x;
5492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
5512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
5522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
5532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
5542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
5552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
5562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
5572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
5582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
5602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
5612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
5622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
5632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
5642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
5652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
5662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
5672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
5692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
5702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
5712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
5722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
5732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
5742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
5752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
5762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
5782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
5792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
5802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
5812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
5822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
5832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
5842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
5852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
5862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
5872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
5882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
5892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
5902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
5912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
5922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
5932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
5942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
5952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
5962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
5972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
5982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
5992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
6002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
6012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
6022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
6042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
6052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
6062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
6072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
6082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
6092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
6102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
6112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
6122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
6132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
6142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
6152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
6162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
6172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
6182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
6192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
6202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[0].x = a_p_0;
6222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[1].x = a_p_1;
6232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[2].x = a_p_2;
6242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[3].x = a_p_3;
6252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[4].x = a_p_4;
6262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[5].x = a_p_5;
6272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[6].x = a_p_6;
6282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[7].x = a_p_7;
6292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[8].x = a_p_8;
6302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[9].x = a_p_9;
6312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[10].x = a_p_a;
6322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[11].x = a_p_b;
6332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[12].x = a_p_c;
6342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[13].x = a_p_d;
6352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[14].x = a_p_e;
6362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[15].x = a_p_f;
6372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void
6402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet16h,8>& kernel) {
6412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN64 half in[8][16];
6422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[0], kernel.packet[0]);
6432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[1], kernel.packet[1]);
6442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[2], kernel.packet[2]);
6452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[3], kernel.packet[3]);
6462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[4], kernel.packet[4]);
6472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[5], kernel.packet[5]);
6482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[6], kernel.packet[6]);
6492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[7], kernel.packet[7]);
6502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN64 half out[8][16];
6522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  for (int i = 0; i < 8; ++i) {
6542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int j = 0; j < 8; ++j) {
6552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      out[i][j] = in[j][2*i];
6562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
6572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int j = 0; j < 8; ++j) {
6582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      out[i][j+8] = in[j][2*i+1];
6592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
6602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  }
6612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[0] = pload<Packet16h>(out[0]);
6632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[1] = pload<Packet16h>(out[1]);
6642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[2] = pload<Packet16h>(out[2]);
6652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[3] = pload<Packet16h>(out[3]);
6662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[4] = pload<Packet16h>(out[4]);
6672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[5] = pload<Packet16h>(out[5]);
6682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[6] = pload<Packet16h>(out[6]);
6692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[7] = pload<Packet16h>(out[7]);
6702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
6712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void
6732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet16h,4>& kernel) {
6742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN64 half in[4][16];
6752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[0], kernel.packet[0]);
6762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[1], kernel.packet[1]);
6772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[2], kernel.packet[2]);
6782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<half>(in[3], kernel.packet[3]);
6792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN64 half out[4][16];
6812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  for (int i = 0; i < 4; ++i) {
6832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int j = 0; j < 4; ++j) {
6842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      out[i][j] = in[j][4*i];
6852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
6862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int j = 0; j < 4; ++j) {
6872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      out[i][j+4] = in[j][4*i+1];
6882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
6892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int j = 0; j < 4; ++j) {
6902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      out[i][j+8] = in[j][4*i+2];
6912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
6922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int j = 0; j < 4; ++j) {
6932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      out[i][j+12] = in[j][4*i+3];
6942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
6952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  }
6962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
6972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[0] = pload<Packet16h>(out[0]);
6982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[1] = pload<Packet16h>(out[1]);
6992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[2] = pload<Packet16h>(out[2]);
7002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[3] = pload<Packet16h>(out[3]);
7012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
7022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#elif defined EIGEN_VECTORIZE_AVX
7052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef struct {
7072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i x;
7082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} Packet8h;
7092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<Packet8h> { enum { value = true }; };
7122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
7142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct packet_traits<Eigen::half> : default_packet_traits {
7152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet8h type;
7162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // There is no half-size packet for Packet8h.
7172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet8h half;
7182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum {
7192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    Vectorizable = 1,
7202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    AlignedOnScalar = 1,
7212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    size = 8,
7222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasHalfPacket = 0,
7232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasAdd    = 0,
7242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSub    = 0,
7252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasMul    = 0,
7262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasNegate = 0,
7272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasAbs    = 0,
7282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasAbs2   = 0,
7292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasMin    = 0,
7302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasMax    = 0,
7312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasConj   = 0,
7322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSetLinear = 0,
7332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasDiv = 0,
7342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSqrt = 0,
7352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasRsqrt = 0,
7362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasExp = 0,
7372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasLog = 0,
7382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasBlend = 0
7392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  };
7402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
7412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
7442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
7462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8h result;
7472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_set1_epi16(from.x);
7482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
7492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
7502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
7522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
7532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
7542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
7562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8h result;
7572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
7582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
7592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
7602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
7622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8h result;
7632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
7642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
7652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
7662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
7682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
7692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
7702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
7722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
7732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
7742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h
7762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangploadquad<Packet8h>(const Eigen::half* from) {
7772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8h result;
7782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  unsigned short a = from[0].x;
7792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  unsigned short b = from[1].x;
7802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
7812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
7822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
7832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
7852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_HAS_FP16_C
7862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_cvtph_ps(a.x);
7872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
7882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN32 Eigen::half aux[8];
7892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(aux, a);
7902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f0(aux[0]);
7912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f1(aux[1]);
7922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f2(aux[2]);
7932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f3(aux[3]);
7942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f4(aux[4]);
7952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f5(aux[5]);
7962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f6(aux[6]);
7972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float f7(aux[7]);
7982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
7992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
8002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
8012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
8042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_HAS_FP16_C
8052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8h result;
8062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
8072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
8082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else
8092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN32 float aux[8];
8102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(aux, a);
8112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half h0(aux[0]);
8122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half h1(aux[1]);
8132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half h2(aux[2]);
8142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half h3(aux[3]);
8152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half h4(aux[4]);
8162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half h5(aux[5]);
8172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half h6(aux[6]);
8182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half h7(aux[7]);
8192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8h result;
8212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
8222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
8232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
8242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
8272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
8292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f af = half2float(a);
8302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f bf = half2float(b);
8312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f rf = padd(af, bf);
8322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return float2half(rf);
8332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
8362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f af = half2float(a);
8372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f bf = half2float(b);
8382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f rf = pmul(af, bf);
8392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return float2half(rf);
8402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
8432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
8442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8h result;
8452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
8462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
8472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
8502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
8512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN32 Eigen::half aux[8];
8522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore(aux, from);
8532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*0].x = aux[0].x;
8542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*1].x = aux[1].x;
8552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*2].x = aux[2].x;
8562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*3].x = aux[3].x;
8572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*4].x = aux[4].x;
8582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*5].x = aux[5].x;
8592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*6].x = aux[6].x;
8602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*7].x = aux[7].x;
8612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
8642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f af = half2float(a);
8652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float reduced = predux<Packet8f>(af);
8662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return Eigen::half(reduced);
8672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
8702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f af = half2float(a);
8712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float reduced = predux_max<Packet8f>(af);
8722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return Eigen::half(reduced);
8732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
8762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f af = half2float(a);
8772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float reduced = predux_min<Packet8f>(af);
8782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return Eigen::half(reduced);
8792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
8822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet8f af = half2float(a);
8832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float reduced = predux_mul<Packet8f>(af);
8842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return Eigen::half(reduced);
8852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
8862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void
8882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet8h,8>& kernel) {
8892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a = kernel.packet[0].x;
8902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i b = kernel.packet[1].x;
8912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i c = kernel.packet[2].x;
8922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i d = kernel.packet[3].x;
8932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i e = kernel.packet[4].x;
8942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i f = kernel.packet[5].x;
8952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i g = kernel.packet[6].x;
8962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i h = kernel.packet[7].x;
8972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
8982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
8992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
9002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
9012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
9022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
9032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
9042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
9052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
9062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
9082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
9092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
9102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
9112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
9122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
9132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
9142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
9152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
9172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
9182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
9192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
9202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
9212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
9222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
9232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
9242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[0].x = a0b0c0d0e0f0g0h0;
9262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[1].x = a1b1c1d1e1f1g1h1;
9272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[2].x = a2b2c2d2e2f2g2h2;
9282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[3].x = a3b3c3d3e3f3g3h3;
9292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[4].x = a4b4c4d4e4f4g4h4;
9302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[5].x = a5b5c5d5e5f5g5h5;
9312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[6].x = a6b6c6d6e6f6g6h6;
9322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[7].x = a7b7c7d7e7f7g7h7;
9332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void
9362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet8h,4>& kernel) {
9372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN32 Eigen::half in[4][8];
9382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<Eigen::half>(in[0], kernel.packet[0]);
9392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<Eigen::half>(in[1], kernel.packet[1]);
9402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<Eigen::half>(in[2], kernel.packet[2]);
9412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  pstore<Eigen::half>(in[3], kernel.packet[3]);
9422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  EIGEN_ALIGN32 Eigen::half out[4][8];
9442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  for (int i = 0; i < 4; ++i) {
9462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int j = 0; j < 4; ++j) {
9472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      out[i][j] = in[j][2*i];
9482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
9492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int j = 0; j < 4; ++j) {
9502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      out[i][j+4] = in[j][2*i+1];
9512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    }
9522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  }
9532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[0] = pload<Packet8h>(out[0]);
9552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[1] = pload<Packet8h>(out[1]);
9562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[2] = pload<Packet8h>(out[2]);
9572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[3] = pload<Packet8h>(out[3]);
9582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
9592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Disable the following code since it's broken on too many platforms / compilers.
9622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
9632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#elif 0
9642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef struct {
9662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m64 x;
9672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} Packet4h;
9682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<Packet4h> { enum { value = true }; };
9712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
9722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <>
9732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct packet_traits<Eigen::half> : default_packet_traits {
9742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet4h type;
9752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  // There is no half-size packet for Packet4h.
9762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  typedef Packet4h half;
9772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  enum {
9782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    Vectorizable = 1,
9792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    AlignedOnScalar = 1,
9802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    size = 4,
9812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasHalfPacket = 0,
9822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasAdd    = 0,
9832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSub    = 0,
9842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasMul    = 0,
9852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasNegate = 0,
9862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasAbs    = 0,
9872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasAbs2   = 0,
9882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasMin    = 0,
9892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasMax    = 0,
9902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasConj   = 0,
9912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSetLinear = 0,
9922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasDiv = 0,
9932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasSqrt = 0,
9942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasRsqrt = 0,
9952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasExp = 0,
9962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasLog = 0,
9972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    HasBlend = 0
9982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  };
9992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang};
10002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
10032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
10052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4h result;
10062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_set1_pi16(from.x);
10072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
10082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
10092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
10112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
10122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
10132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
10152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
10172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __int64_t a64 = _mm_cvtm64_si64(a.x);
10182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __int64_t b64 = _mm_cvtm64_si64(b.x);
10192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half h[4];
10212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
10232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
10242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  h[0] = ha + hb;
10252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
10262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
10272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  h[1] = ha + hb;
10282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
10292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
10302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  h[2] = ha + hb;
10312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
10322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
10332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  h[3] = ha + hb;
10342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4h result;
10352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
10362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
10372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
10382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
10402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __int64_t a64 = _mm_cvtm64_si64(a.x);
10412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __int64_t b64 = _mm_cvtm64_si64(b.x);
10422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half h[4];
10442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
10462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
10472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  h[0] = ha * hb;
10482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
10492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
10502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  h[1] = ha * hb;
10512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
10522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
10532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  h[2] = ha * hb;
10542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
10552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
10562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  h[3] = ha * hb;
10572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4h result;
10582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
10592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
10602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
10612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
10632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4h result;
10642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
10652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
10662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
10672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
10692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4h result;
10702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
10712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
10722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
10732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
10752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __int64_t r = _mm_cvtm64_si64(from.x);
10762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  *(reinterpret_cast<__int64_t*>(to)) = r;
10772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
10782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
10802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __int64_t r = _mm_cvtm64_si64(from.x);
10812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  *(reinterpret_cast<__int64_t*>(to)) = r;
10822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
10832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h
10852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangploadquad<Packet4h>(const Eigen::half* from) {
10862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return pset1<Packet4h>(*from);
10872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
10882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
10902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
10912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet4h result;
10922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
10932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return result;
10942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
10952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
10962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
10972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
10982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __int64_t a = _mm_cvtm64_si64(from.x);
10992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*0].x = static_cast<unsigned short>(a);
11002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*1].x = static_cast<unsigned short>(a >> 16);
11012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*2].x = static_cast<unsigned short>(a >> 32);
11022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  to[stride*3].x = static_cast<unsigned short>(a >> 48);
11032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
11042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void
11062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet4h,4>& kernel) {
11072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
11082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
11092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
11102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
11112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
11132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
11142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
11152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
11162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
11172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif
11192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
11212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
11222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
11232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
1124