12b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// This file is part of Eigen, a lightweight C++ template library 22b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// for linear algebra. 32b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// 42b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> 52b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// 62b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// This Source Code Form is subject to the terms of the Mozilla 72b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Public License v. 2.0. If a copy of the MPL was not distributed 82b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 92b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H 112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#define EIGEN_PACKET_MATH_HALF_CUDA_H 122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace Eigen { 152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace internal { 162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Most of the following operations require arch >= 3.0 182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<half2> { enum { value = true }; }; 212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct packet_traits<Eigen::half> : default_packet_traits 232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef half2 type; 252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef half2 half; 262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang enum { 272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Vectorizable = 1, 282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang AlignedOnScalar = 1, 292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang size=2, 302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasHalfPacket = 0, 312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasAdd = 1, 322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasMul = 1, 332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasDiv = 1, 342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSqrt = 1, 352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasRsqrt = 1, 362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasExp = 1, 372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasLog = 1, 382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasLog1p = 1 392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang }; 402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}; 412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; 432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) { 452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __half2half2(from); 462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) { 492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return *reinterpret_cast<const half2*>(from); 502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) { 532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __halves2half2(from[0], from[1]); 542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) { 572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __halves2half2(from[0], from[0]); 582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) { 612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang *reinterpret_cast<half2*>(to) = from; 622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) { 652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[0] = __low2half(from); 662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[1] = __high2half(from); 672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> 702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) { 712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 350 722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __ldg((const half2*)from); 732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __halves2half2(*(from+0), *(from+1)); 752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> 792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) { 802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 350 812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __halves2half2(__ldg(from+0), __ldg(from+1)); 822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __halves2half2(*(from+0), *(from+1)); 842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) { 882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __halves2half2(from[0*stride], from[1*stride]); 892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) { 922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*0] = __low2half(from); 932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*1] = __high2half(from); 942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) { 972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __low2half(a); 982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) { 1012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half2 result; 1022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = a.x & 0x7FFF7FFF; 1032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 1042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang__device__ EIGEN_STRONG_INLINE void 1082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<half2,2>& kernel) { 1092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half a1 = __low2half(kernel.packet[0]); 1102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half a2 = __high2half(kernel.packet[0]); 1112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half b1 = __low2half(kernel.packet[1]); 1122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half b2 = __high2half(kernel.packet[1]); 1132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[0] = __halves2half2(a1, b1); 1142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[1] = __halves2half2(a2, b2); 1152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) { 1182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530 1192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __halves2half2(a, __hadd(a, __float2half(1.0f))); 1202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 1212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f = __half2float(a) + 1.0f; 1222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __halves2half2(a, __float2half(f)); 1232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 1242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) { 1272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530 1282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __hadd2(a, b); 1292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 1302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 1312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 1322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b1 = __low2float(b); 1332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b2 = __high2float(b); 1342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r1 = a1 + b1; 1352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r2 = a2 + b2; 1362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(r1, r2); 1372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 1382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) { 1412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530 1422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __hsub2(a, b); 1432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 1442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 1452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 1462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b1 = __low2float(b); 1472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b2 = __high2float(b); 1482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r1 = a1 - b1; 1492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r2 = a2 - b2; 1502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(r1, r2); 1512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 1522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { 1552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530 1562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __hneg2(a); 1572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 1582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 1592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 1602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(-a1, -a2); 1612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 1622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } 1652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) { 1672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530 1682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __hmul2(a, b); 1692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 1702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 1712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 1722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b1 = __low2float(b); 1732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b2 = __high2float(b); 1742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r1 = a1 * b1; 1752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r2 = a2 * b2; 1762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(r1, r2); 1772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 1782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) { 1812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530 1822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __hfma2(a, b, c); 1832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 1842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 1852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 1862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b1 = __low2float(b); 1872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b2 = __high2float(b); 1882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float c1 = __low2float(c); 1892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float c2 = __high2float(c); 1902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r1 = a1 * b1 + c1; 1912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r2 = a2 * b2 + c2; 1922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(r1, r2); 1932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 1942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 1952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 1962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) { 1972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 1982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 1992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b1 = __low2float(b); 2002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b2 = __high2float(b); 2012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r1 = a1 / b1; 2022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r2 = a2 / b2; 2032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(r1, r2); 2042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) { 2072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 2082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 2092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b1 = __low2float(b); 2102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b2 = __high2float(b); 2112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); 2122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); 2132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __halves2half2(r1, r2); 2142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) { 2172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 2182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 2192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b1 = __low2float(b); 2202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float b2 = __high2float(b); 2212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); 2222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); 2232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __halves2half2(r1, r2); 2242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) { 2272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530 2282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __hadd(__low2half(a), __high2half(a)); 2292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 2302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 2312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 2322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2))); 2332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 2342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) { 2372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530 2382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half first = __low2half(a); 2392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half second = __high2half(a); 2402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __hgt(first, second) ? first : second; 2412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 2422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 2432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 2442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return a1 > a2 ? __low2half(a) : __high2half(a); 2452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 2462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) { 2492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530 2502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half first = __low2half(a); 2512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __half second = __high2half(a); 2522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __hlt(first, second) ? first : second; 2532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 2542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 2552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 2562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return a1 < a2 ? __low2half(a) : __high2half(a); 2572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 2582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) { 2612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if __CUDA_ARCH__ >= 530 2622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __hmul(__low2half(a), __high2half(a)); 2632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 2642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 2652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 2662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2))); 2672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 2682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) { 2712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 2722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 2732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r1 = log1pf(a1); 2742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r2 = log1pf(a2); 2752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(r1, r2); 2762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 2792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE 2812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wanghalf2 plog<half2>(const half2& a) { 2822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return h2log(a); 2832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE 2862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wanghalf2 pexp<half2>(const half2& a) { 2872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return h2exp(a); 2882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE 2912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wanghalf2 psqrt<half2>(const half2& a) { 2922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return h2sqrt(a); 2932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 2952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE 2962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wanghalf2 prsqrt<half2>(const half2& a) { 2972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return h2rsqrt(a); 2982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 2992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 3012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) { 3032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 3042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 3052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r1 = logf(a1); 3062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r2 = logf(a2); 3072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(r1, r2); 3082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) { 3112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 3122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 3132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r1 = expf(a1); 3142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r2 = expf(a2); 3152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(r1, r2); 3162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) { 3192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 3202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 3212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r1 = sqrtf(a1); 3222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r2 = sqrtf(a2); 3232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(r1, r2); 3242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) { 3272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a1 = __low2float(a); 3282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float a2 = __high2float(a); 3292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r1 = rsqrtf(a1); 3302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float r2 = rsqrtf(a2); 3312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return __floats2half2_rn(r1, r2); 3322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 3352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#elif defined EIGEN_VECTORIZE_AVX512 3372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef struct { 3392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i x; 3402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} Packet16h; 3412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<Packet16h> { enum { value = true }; }; 3442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <> 3462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct packet_traits<half> : default_packet_traits { 3472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet16h type; 3482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // There is no half-size packet for Packet16h. 3492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet16h half; 3502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang enum { 3512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Vectorizable = 1, 3522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang AlignedOnScalar = 1, 3532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang size = 16, 3542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasHalfPacket = 0, 3552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasAdd = 0, 3562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSub = 0, 3572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasMul = 0, 3582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasNegate = 0, 3592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasAbs = 0, 3602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasAbs2 = 0, 3612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasMin = 0, 3622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasMax = 0, 3632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasConj = 0, 3642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSetLinear = 0, 3652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasDiv = 0, 3662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSqrt = 0, 3672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasRsqrt = 0, 3682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasExp = 0, 3692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasLog = 0, 3702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasBlend = 0 3712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang }; 3722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}; 3732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; 3762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) { 3782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16h result; 3792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm256_set1_epi16(from.x); 3802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 3812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) { 3842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0))); 3852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) { 3882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16h result; 3892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); 3902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 3912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) { 3942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16h result; 3952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); 3962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 3972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 3982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 3992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) { 4002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _mm256_store_si256((__m256i*)to, from.x); 4012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) { 4042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _mm256_storeu_si256((__m256i*)to, from.x); 4052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h 4082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangploadquad(const Eigen::half* from) { 4092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16h result; 4102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang unsigned short a = from[0].x; 4112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang unsigned short b = from[1].x; 4122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang unsigned short c = from[2].x; 4132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang unsigned short d = from[3].x; 4142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); 4152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 4162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { 4192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_HAS_FP16_C 4202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm512_cvtph_ps(a.x); 4212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 4222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN64 half aux[16]; 4232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore(aux, a); 4242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f0(aux[0]); 4252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f1(aux[1]); 4262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f2(aux[2]); 4272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f3(aux[3]); 4282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f4(aux[4]); 4292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f5(aux[5]); 4302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f6(aux[6]); 4312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f7(aux[7]); 4322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f8(aux[8]); 4332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f9(aux[9]); 4342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float fa(aux[10]); 4352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float fb(aux[11]); 4362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float fc(aux[12]); 4372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float fd(aux[13]); 4382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float fe(aux[14]); 4392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float ff(aux[15]); 4402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm512_set_ps( 4422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); 4432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 4442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { 4472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_HAS_FP16_C 4482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16h result; 4492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); 4502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 4512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 4522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN64 float aux[16]; 4532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore(aux, a); 4542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half h0(aux[0]); 4552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half h1(aux[1]); 4562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half h2(aux[2]); 4572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half h3(aux[3]); 4582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half h4(aux[4]); 4592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half h5(aux[5]); 4602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half h6(aux[6]); 4612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half h7(aux[7]); 4622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half h8(aux[8]); 4632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half h9(aux[9]); 4642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half ha(aux[10]); 4652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half hb(aux[11]); 4662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half hc(aux[12]); 4672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half hd(aux[13]); 4682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half he(aux[14]); 4692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang half hf(aux[15]); 4702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16h result; 4722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm256_set_epi16( 4732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, 4742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); 4752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 4762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 4772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) { 4802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16f af = half2float(a); 4812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16f bf = half2float(b); 4822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16f rf = padd(af, bf); 4832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return float2half(rf); 4842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) { 4872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16f af = half2float(a); 4882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16f bf = half2float(b); 4892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16f rf = pmul(af, bf); 4902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return float2half(rf); 4912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) { 4942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16f from_float = half2float(from); 4952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return half(predux(from_float)); 4962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 4972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) 4992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 5002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet16h result; 5012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm256_set_epi16( 5022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, 5032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, 5042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, 5052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); 5062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 5072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) 5102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 5112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN64 half aux[16]; 5122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore(aux, from); 5132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*0].x = aux[0].x; 5142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*1].x = aux[1].x; 5152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*2].x = aux[2].x; 5162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*3].x = aux[3].x; 5172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*4].x = aux[4].x; 5182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*5].x = aux[5].x; 5192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*6].x = aux[6].x; 5202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*7].x = aux[7].x; 5212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*8].x = aux[8].x; 5222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*9].x = aux[9].x; 5232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*10].x = aux[10].x; 5242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*11].x = aux[11].x; 5252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*12].x = aux[12].x; 5262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*13].x = aux[13].x; 5272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*14].x = aux[14].x; 5282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*15].x = aux[15].x; 5292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void 5322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet16h,16>& kernel) { 5332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a = kernel.packet[0].x; 5342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i b = kernel.packet[1].x; 5352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i c = kernel.packet[2].x; 5362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i d = kernel.packet[3].x; 5372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i e = kernel.packet[4].x; 5382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i f = kernel.packet[5].x; 5392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i g = kernel.packet[6].x; 5402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i h = kernel.packet[7].x; 5412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i i = kernel.packet[8].x; 5422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i j = kernel.packet[9].x; 5432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i k = kernel.packet[10].x; 5442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i l = kernel.packet[11].x; 5452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i m = kernel.packet[12].x; 5462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i n = kernel.packet[13].x; 5472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i o = kernel.packet[14].x; 5482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i p = kernel.packet[15].x; 5492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ab_07 = _mm256_unpacklo_epi16(a, b); 5512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i cd_07 = _mm256_unpacklo_epi16(c, d); 5522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ef_07 = _mm256_unpacklo_epi16(e, f); 5532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i gh_07 = _mm256_unpacklo_epi16(g, h); 5542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ij_07 = _mm256_unpacklo_epi16(i, j); 5552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i kl_07 = _mm256_unpacklo_epi16(k, l); 5562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i mn_07 = _mm256_unpacklo_epi16(m, n); 5572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i op_07 = _mm256_unpacklo_epi16(o, p); 5582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ab_8f = _mm256_unpackhi_epi16(a, b); 5602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i cd_8f = _mm256_unpackhi_epi16(c, d); 5612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ef_8f = _mm256_unpackhi_epi16(e, f); 5622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i gh_8f = _mm256_unpackhi_epi16(g, h); 5632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ij_8f = _mm256_unpackhi_epi16(i, j); 5642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i kl_8f = _mm256_unpackhi_epi16(k, l); 5652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i mn_8f = _mm256_unpackhi_epi16(m, n); 5662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i op_8f = _mm256_unpackhi_epi16(o, p); 5672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); 5692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); 5702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); 5712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); 5722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); 5732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); 5742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); 5752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); 5762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); 5782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); 5792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); 5802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); 5812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); 5822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); 5832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); 5842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); 5852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); 5872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); 5882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); 5892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); 5902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); 5912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); 5922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); 5932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); 5942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); 5952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); 5962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); 5972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); 5982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); 5992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); 6002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); 6012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); 6022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // NOTE: no unpacklo/hi instr in this case, so using permute instr. 6042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); 6052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); 6062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); 6072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); 6082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); 6092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); 6102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); 6112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); 6122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); 6132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); 6142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); 6152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); 6162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); 6172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); 6182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); 6192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); 6202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[0].x = a_p_0; 6222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[1].x = a_p_1; 6232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[2].x = a_p_2; 6242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[3].x = a_p_3; 6252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[4].x = a_p_4; 6262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[5].x = a_p_5; 6272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[6].x = a_p_6; 6282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[7].x = a_p_7; 6292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[8].x = a_p_8; 6302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[9].x = a_p_9; 6312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[10].x = a_p_a; 6322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[11].x = a_p_b; 6332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[12].x = a_p_c; 6342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[13].x = a_p_d; 6352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[14].x = a_p_e; 6362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[15].x = a_p_f; 6372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 6382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void 6402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet16h,8>& kernel) { 6412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN64 half in[8][16]; 6422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[0], kernel.packet[0]); 6432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[1], kernel.packet[1]); 6442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[2], kernel.packet[2]); 6452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[3], kernel.packet[3]); 6462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[4], kernel.packet[4]); 6472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[5], kernel.packet[5]); 6482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[6], kernel.packet[6]); 6492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[7], kernel.packet[7]); 6502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN64 half out[8][16]; 6522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int i = 0; i < 8; ++i) { 6542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int j = 0; j < 8; ++j) { 6552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang out[i][j] = in[j][2*i]; 6562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 6572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int j = 0; j < 8; ++j) { 6582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang out[i][j+8] = in[j][2*i+1]; 6592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 6602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 6612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[0] = pload<Packet16h>(out[0]); 6632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[1] = pload<Packet16h>(out[1]); 6642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[2] = pload<Packet16h>(out[2]); 6652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[3] = pload<Packet16h>(out[3]); 6662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[4] = pload<Packet16h>(out[4]); 6672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[5] = pload<Packet16h>(out[5]); 6682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[6] = pload<Packet16h>(out[6]); 6692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[7] = pload<Packet16h>(out[7]); 6702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 6712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void 6732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet16h,4>& kernel) { 6742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN64 half in[4][16]; 6752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[0], kernel.packet[0]); 6762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[1], kernel.packet[1]); 6772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[2], kernel.packet[2]); 6782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<half>(in[3], kernel.packet[3]); 6792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN64 half out[4][16]; 6812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int i = 0; i < 4; ++i) { 6832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int j = 0; j < 4; ++j) { 6842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang out[i][j] = in[j][4*i]; 6852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 6862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int j = 0; j < 4; ++j) { 6872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang out[i][j+4] = in[j][4*i+1]; 6882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 6892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int j = 0; j < 4; ++j) { 6902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang out[i][j+8] = in[j][4*i+2]; 6912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 6922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int j = 0; j < 4; ++j) { 6932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang out[i][j+12] = in[j][4*i+3]; 6942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 6952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 6962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 6972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[0] = pload<Packet16h>(out[0]); 6982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[1] = pload<Packet16h>(out[1]); 6992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[2] = pload<Packet16h>(out[2]); 7002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[3] = pload<Packet16h>(out[3]); 7012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 7022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#elif defined EIGEN_VECTORIZE_AVX 7052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef struct { 7072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i x; 7082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} Packet8h; 7092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<Packet8h> { enum { value = true }; }; 7122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <> 7142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct packet_traits<Eigen::half> : default_packet_traits { 7152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet8h type; 7162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // There is no half-size packet for Packet8h. 7172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet8h half; 7182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang enum { 7192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Vectorizable = 1, 7202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang AlignedOnScalar = 1, 7212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang size = 8, 7222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasHalfPacket = 0, 7232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasAdd = 0, 7242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSub = 0, 7252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasMul = 0, 7262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasNegate = 0, 7272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasAbs = 0, 7282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasAbs2 = 0, 7292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasMin = 0, 7302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasMax = 0, 7312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasConj = 0, 7322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSetLinear = 0, 7332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasDiv = 0, 7342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSqrt = 0, 7352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasRsqrt = 0, 7362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasExp = 0, 7372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasLog = 0, 7382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasBlend = 0 7392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang }; 7402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}; 7412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; 7442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) { 7462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8h result; 7472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_set1_epi16(from.x); 7482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 7492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 7502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) { 7522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0))); 7532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 7542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) { 7562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8h result; 7572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from)); 7582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 7592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 7602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) { 7622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8h result; 7632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); 7642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 7652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 7662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) { 7682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); 7692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 7702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) { 7722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); 7732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 7742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h 7762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangploadquad<Packet8h>(const Eigen::half* from) { 7772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8h result; 7782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang unsigned short a = from[0].x; 7792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang unsigned short b = from[1].x; 7802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); 7812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 7822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 7832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { 7852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_HAS_FP16_C 7862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_cvtph_ps(a.x); 7872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 7882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN32 Eigen::half aux[8]; 7892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore(aux, a); 7902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f0(aux[0]); 7912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f1(aux[1]); 7922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f2(aux[2]); 7932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f3(aux[3]); 7942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f4(aux[4]); 7952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f5(aux[5]); 7962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f6(aux[6]); 7972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float f7(aux[7]); 7982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 7992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); 8002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 8012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 8022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { 8042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#ifdef EIGEN_HAS_FP16_C 8052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8h result; 8062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); 8072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 8082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 8092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN32 float aux[8]; 8102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore(aux, a); 8112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half h0(aux[0]); 8122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half h1(aux[1]); 8132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half h2(aux[2]); 8142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half h3(aux[3]); 8152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half h4(aux[4]); 8162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half h5(aux[5]); 8172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half h6(aux[6]); 8182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half h7(aux[7]); 8192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8h result; 8212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); 8222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 8232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 8242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 8252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } 8272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) { 8292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f af = half2float(a); 8302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f bf = half2float(b); 8312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f rf = padd(af, bf); 8322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return float2half(rf); 8332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 8342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) { 8362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f af = half2float(a); 8372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f bf = half2float(b); 8382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f rf = pmul(af, bf); 8392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return float2half(rf); 8402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 8412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) 8432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 8442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8h result; 8452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); 8462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 8472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 8482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) 8502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 8512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN32 Eigen::half aux[8]; 8522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore(aux, from); 8532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*0].x = aux[0].x; 8542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*1].x = aux[1].x; 8552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*2].x = aux[2].x; 8562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*3].x = aux[3].x; 8572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*4].x = aux[4].x; 8582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*5].x = aux[5].x; 8592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*6].x = aux[6].x; 8602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*7].x = aux[7].x; 8612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 8622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) { 8642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f af = half2float(a); 8652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float reduced = predux<Packet8f>(af); 8662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return Eigen::half(reduced); 8672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 8682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) { 8702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f af = half2float(a); 8712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float reduced = predux_max<Packet8f>(af); 8722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return Eigen::half(reduced); 8732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 8742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) { 8762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f af = half2float(a); 8772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float reduced = predux_min<Packet8f>(af); 8782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return Eigen::half(reduced); 8792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 8802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) { 8822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet8f af = half2float(a); 8832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang float reduced = predux_mul<Packet8f>(af); 8842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return Eigen::half(reduced); 8852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 8862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void 8882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet8h,8>& kernel) { 8892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a = kernel.packet[0].x; 8902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i b = kernel.packet[1].x; 8912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i c = kernel.packet[2].x; 8922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i d = kernel.packet[3].x; 8932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i e = kernel.packet[4].x; 8942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i f = kernel.packet[5].x; 8952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i g = kernel.packet[6].x; 8962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i h = kernel.packet[7].x; 8972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 8982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a03b03 = _mm_unpacklo_epi16(a, b); 8992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i c03d03 = _mm_unpacklo_epi16(c, d); 9002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i e03f03 = _mm_unpacklo_epi16(e, f); 9012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i g03h03 = _mm_unpacklo_epi16(g, h); 9022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a47b47 = _mm_unpackhi_epi16(a, b); 9032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i c47d47 = _mm_unpackhi_epi16(c, d); 9042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i e47f47 = _mm_unpackhi_epi16(e, f); 9052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i g47h47 = _mm_unpackhi_epi16(g, h); 9062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); 9082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); 9092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); 9102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); 9112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); 9122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); 9132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); 9142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); 9152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); 9172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); 9182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); 9192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); 9202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); 9212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); 9222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); 9232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); 9242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[0].x = a0b0c0d0e0f0g0h0; 9262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[1].x = a1b1c1d1e1f1g1h1; 9272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[2].x = a2b2c2d2e2f2g2h2; 9282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[3].x = a3b3c3d3e3f3g3h3; 9292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[4].x = a4b4c4d4e4f4g4h4; 9302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[5].x = a5b5c5d5e5f5g5h5; 9312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[6].x = a6b6c6d6e6f6g6h6; 9322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[7].x = a7b7c7d7e7f7g7h7; 9332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 9342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void 9362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet8h,4>& kernel) { 9372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN32 Eigen::half in[4][8]; 9382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<Eigen::half>(in[0], kernel.packet[0]); 9392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<Eigen::half>(in[1], kernel.packet[1]); 9402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<Eigen::half>(in[2], kernel.packet[2]); 9412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang pstore<Eigen::half>(in[3], kernel.packet[3]); 9422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang EIGEN_ALIGN32 Eigen::half out[4][8]; 9442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int i = 0; i < 4; ++i) { 9462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int j = 0; j < 4; ++j) { 9472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang out[i][j] = in[j][2*i]; 9482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 9492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang for (int j = 0; j < 4; ++j) { 9502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang out[i][j+4] = in[j][2*i+1]; 9512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 9522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang } 9532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[0] = pload<Packet8h>(out[0]); 9552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[1] = pload<Packet8h>(out[1]); 9562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[2] = pload<Packet8h>(out[2]); 9572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[3] = pload<Packet8h>(out[3]); 9582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 9592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Disable the following code since it's broken on too many platforms / compilers. 9622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) 9632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#elif 0 9642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtypedef struct { 9662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m64 x; 9672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} Packet4h; 9682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct is_arithmetic<Packet4h> { enum { value = true }; }; 9712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 9722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <> 9732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangstruct packet_traits<Eigen::half> : default_packet_traits { 9742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet4h type; 9752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // There is no half-size packet for Packet4h. 9762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang typedef Packet4h half; 9772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang enum { 9782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Vectorizable = 1, 9792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang AlignedOnScalar = 1, 9802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang size = 4, 9812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasHalfPacket = 0, 9822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasAdd = 0, 9832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSub = 0, 9842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasMul = 0, 9852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasNegate = 0, 9862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasAbs = 0, 9872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasAbs2 = 0, 9882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasMin = 0, 9892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasMax = 0, 9902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasConj = 0, 9912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSetLinear = 0, 9922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasDiv = 0, 9932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasSqrt = 0, 9942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasRsqrt = 0, 9952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasExp = 0, 9962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasLog = 0, 9972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang HasBlend = 0 9982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang }; 9992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}; 10002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; 10032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) { 10052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4h result; 10062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_set1_pi16(from.x); 10072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 10082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 10092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) { 10112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x))); 10122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 10132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } 10152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) { 10172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __int64_t a64 = _mm_cvtm64_si64(a.x); 10182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __int64_t b64 = _mm_cvtm64_si64(b.x); 10192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half h[4]; 10212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64)); 10232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64)); 10242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang h[0] = ha + hb; 10252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16)); 10262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16)); 10272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang h[1] = ha + hb; 10282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32)); 10292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32)); 10302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang h[2] = ha + hb; 10312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48)); 10322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48)); 10332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang h[3] = ha + hb; 10342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4h result; 10352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); 10362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 10372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 10382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) { 10402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __int64_t a64 = _mm_cvtm64_si64(a.x); 10412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __int64_t b64 = _mm_cvtm64_si64(b.x); 10422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half h[4]; 10442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64)); 10462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64)); 10472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang h[0] = ha * hb; 10482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16)); 10492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16)); 10502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang h[1] = ha * hb; 10512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32)); 10522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32)); 10532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang h[2] = ha * hb; 10542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48)); 10552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48)); 10562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang h[3] = ha * hb; 10572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4h result; 10582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); 10592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 10602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 10612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10622b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) { 10632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4h result; 10642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from)); 10652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 10662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 10672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) { 10692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4h result; 10702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from)); 10712b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 10722b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 10732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) { 10752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __int64_t r = _mm_cvtm64_si64(from.x); 10762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang *(reinterpret_cast<__int64_t*>(to)) = r; 10772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 10782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) { 10802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __int64_t r = _mm_cvtm64_si64(from.x); 10812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang *(reinterpret_cast<__int64_t*>(to)) = r; 10822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 10832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h 10852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangploadquad<Packet4h>(const Eigen::half* from) { 10862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return pset1<Packet4h>(*from); 10872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 10882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride) 10902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 10912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4h result; 10922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); 10932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return result; 10942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 10952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 10962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride) 10972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 10982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __int64_t a = _mm_cvtm64_si64(from.x); 10992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*0].x = static_cast<unsigned short>(a); 11002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*1].x = static_cast<unsigned short>(a >> 16); 11012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*2].x = static_cast<unsigned short>(a >> 32); 11022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang to[stride*3].x = static_cast<unsigned short>(a >> 48); 11032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 11042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 11052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_STRONG_INLINE void 11062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptranspose(PacketBlock<Packet4h,4>& kernel) { 11072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); 11082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); 11092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); 11102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); 11112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 11122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); 11132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); 11142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); 11152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); 11162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 11172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 11182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 11192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 11202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 11212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 11222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 11232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif // EIGEN_PACKET_MATH_HALF_CUDA_H 1124