1c8b59c046895fa5b6d79f73e0b5817330fcfbfc1A. Unique TensorFlower/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 27cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 37cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerLicensed under the Apache License, Version 2.0 (the "License"); 47cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFloweryou may not use this file except in compliance with the License. 57cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerYou may obtain a copy of the License at 67cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 77cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower http://www.apache.org/licenses/LICENSE-2.0 87cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 97cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerUnless required by applicable law or agreed to in writing, software 107cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerdistributed under the License is distributed on an "AS IS" BASIS, 117cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 127cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerSee the License for the specific language governing permissions and 137cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerlimitations under the License. 147cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower==============================================================================*/ 157cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 167cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifndef TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_ 177cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#define TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_ 187cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 197cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#include "third_party/eigen3/Eigen/Core" 207cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#include "tensorflow/core/platform/types.h" 217cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 22818993c7751601527d662d2417f220e4e856e4efVijay Vasudevan#if defined(PLATFORM_WINDOWS) 231e44b15ff02fe3bf4764189eb1b796602d669c3enamrata-ibm#include "tensorflow/core/platform/windows/cpu_info.h" 24818993c7751601527d662d2417f220e4e856e4efVijay Vasudevan#include "tensorflow/core/platform/windows/intrinsics_port.h" 25818993c7751601527d662d2417f220e4e856e4efVijay Vasudevan#endif 26818993c7751601527d662d2417f220e4e856e4efVijay Vasudevan 277cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowernamespace Eigen { 287cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowernamespace internal { 297cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 307cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return the float representation of the bfloat16 value 317cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// in the lower 16-bits of input 327cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 337cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pexpand_bf16_l(const Packet& from) { 3423caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner tensorflow::uint32 tmp; 351e44b15ff02fe3bf4764189eb1b796602d669c3enamrata-ibm#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 3623caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner tmp = (reinterpret_cast<const tensorflow::uint32&>(from)) & 0xffff0000; 3723caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner#else 3823caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner tmp = (reinterpret_cast<const tensorflow::uint32&>(from) << 16) & 0xffff0000; 391e44b15ff02fe3bf4764189eb1b796602d669c3enamrata-ibm#endif 407cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return reinterpret_cast<const float&>(tmp); 417cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 427cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 437cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return the float representation of the bfloat16 value 447cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// in the upper 16-bits of input 457cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 467cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pexpand_bf16_u(const Packet& from) { 4723caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner tensorflow::uint32 tmp; 481e44b15ff02fe3bf4764189eb1b796602d669c3enamrata-ibm#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 4923caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner tmp = (reinterpret_cast<const tensorflow::uint32&>(from) << 16) & 0xffff0000; 501e44b15ff02fe3bf4764189eb1b796602d669c3enamrata-ibm#else 5123caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner tmp = (reinterpret_cast<const tensorflow::uint32&>(from)) & 0xffff0000; 5223caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner#endif 537cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return reinterpret_cast<const float&>(tmp); 547cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 557cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 5667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower// Specialization non-scalar version on non-sse. 5796e9e8e8fc8d027021225fc8bd665968859c8d01namrata-ibm// Enable vectorization on z13 and higher 58579438b835aafc377bba01ef6c50beca896a56bbBenoit Steiner#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) || \ 5996e9e8e8fc8d027021225fc8bd665968859c8d01namrata-ibm defined(EIGEN_VECTORIZE_NEON) || defined(EIGEN_VECTORIZE_ZVECTOR) 6067324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowertemplate <typename Packet> 6167324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_l(const Packet4f& from) { 6267324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower float r[4]; 6367324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower tensorflow::uint32 p[4]; 6467324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower pstoreu(r, from); 6523caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner tensorflow::uint32* ir = reinterpret_cast<tensorflow::uint32*>(r); 6667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[0] = (ir[0] << 16) & 0xffff0000; 6723caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner p[1] = ir[0] & 0xffff0000; 6867324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[2] = (ir[1] << 16) & 0xffff0000; 6967324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[3] = ir[1] & 0xffff0000; 7023caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner return ploadu<Packet4f>(reinterpret_cast<float*>(p)); 7167324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower} 7267324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower 7367324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowertemplate <typename Packet> 7467324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_u(const Packet4f& from) { 7567324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower float r[4]; 7667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower tensorflow::uint32 p[4]; 7767324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower pstoreu(r, from); 7823caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner tensorflow::uint32* ir = reinterpret_cast<tensorflow::uint32*>(r); 7967324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[0] = (ir[2] << 16) & 0xffff0000; 8067324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[1] = ir[2] & 0xffff0000; 8167324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[2] = (ir[3] << 16) & 0xffff0000; 8267324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[3] = ir[3] & 0xffff0000; 8323caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner return ploadu<Packet4f>(reinterpret_cast<float*>(p)); 8467324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower} 8567324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower#endif 8667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower 877cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 887cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pinterleave4x64(const Packet& from) { 897cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return from; 907cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 917cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 927cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 937cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pbroadcast_first(const Packet& a) { 947cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return a; 957cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 967cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 977cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 987cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pbroadcast_second(const Packet& a) { 997cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower assert(false && "Not applicable to Scalar Values"); 1007cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return a; 1017cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 1027cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 1037cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 1047cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pbroadcast_third(const Packet& a) { 1057cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower assert(false && "Not applicable to Scalar Values"); 1067cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return a; 1077cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 1087cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 1097cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 1107cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pbroadcast_fourth(const Packet& a) { 1117cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower assert(false && "Not applicable to Scalar Values"); 1127cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return a; 1137cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 1147cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 1157cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 1167cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pload4bf16( 1177cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower const typename unpacket_traits<Packet>::type* from) { 1187cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower assert(false && "Not applicable to Scalar Values"); 11967324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower return Packet(); 1207cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 1217cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 1227cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 1237cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pload2bf16( 1247cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower const typename unpacket_traits<Packet>::type* from) { 1257cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower assert(false && "Not applicable to Scalar Values"); 12667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower return Packet(); 12767324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower} 12867324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower 129579438b835aafc377bba01ef6c50beca896a56bbBenoit Steiner// Specialization for pload4bf16 and pload2bf16 for non-sse. 13096e9e8e8fc8d027021225fc8bd665968859c8d01namrata-ibm// Enable vectorization on z13 and higher. 131579438b835aafc377bba01ef6c50beca896a56bbBenoit Steiner#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) || \ 13296e9e8e8fc8d027021225fc8bd665968859c8d01namrata-ibm defined(EIGEN_VECTORIZE_NEON) || defined(EIGEN_VECTORIZE_ZVECTOR) 13367324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowertemplate <> 13467324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pload4bf16<Packet4f>(const float* from) { 13567324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower tensorflow::uint32 p[4]; 13623caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner const tensorflow::uint32* ir = 13723caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner reinterpret_cast<const tensorflow::uint32*>(from); 13867324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[0] = (ir[0] << 16) & 0xffff0000; 13923caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner p[1] = ir[0] & 0xffff0000; 14067324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[2] = (ir[1] << 16) & 0xffff0000; 14167324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[3] = ir[1] & 0xffff0000; 14223caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner return ploadu<Packet4f>(reinterpret_cast<float*>(p)); 14367324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower} 14467324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower 14567324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowertemplate <> 14667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pload2bf16<Packet4f>(const float* from) { 14767324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower tensorflow::uint32 p[4]; 14823caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner const tensorflow::uint32* ir = 14923caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner reinterpret_cast<const tensorflow::uint32*>(from); 15067324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[0] = (ir[0] << 16) & 0xffff0000; 15123caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner p[1] = ir[0] & 0xffff0000; 15267324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[2] = (ir[0] << 16) & 0xffff0000; 15367324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower p[3] = ir[0] & 0xffff0000; 15423caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner return ploadu<Packet4f>(reinterpret_cast<float*>(p)); 1557cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 15667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower#endif 1577cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 158921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) 159921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood// Return a packet with the first value of the input Packet replicated 160921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Soodtemplate <> 161921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav SoodEIGEN_STRONG_INLINE Packet4f pbroadcast_first<Packet4f>(const Packet4f& a) { 162982549ea3423df4270ff154e5c764beb43d472daRasmus Munk Larsen return vec_splat(a, 0); 163921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood} 164921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood 165921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood// Return a packet with the second value of the input Packet replicated 166921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Soodtemplate <> 167921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav SoodEIGEN_STRONG_INLINE Packet4f pbroadcast_second<Packet4f>(const Packet4f& a) { 168982549ea3423df4270ff154e5c764beb43d472daRasmus Munk Larsen return vec_splat(a, 1); 169921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood} 170921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood 171921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood// Return a packet with the third value of the input Packet replicated 172921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Soodtemplate <> 173921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav SoodEIGEN_STRONG_INLINE Packet4f pbroadcast_third<Packet4f>(const Packet4f& a) { 174982549ea3423df4270ff154e5c764beb43d472daRasmus Munk Larsen return vec_splat(a, 2); 175921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood} 176921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood 177921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood// Return a packet with the fourth value of the input Packet replicated 178921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Soodtemplate <> 179921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav SoodEIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) { 180982549ea3423df4270ff154e5c764beb43d472daRasmus Munk Larsen return vec_splat(a, 3); 181921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood} 182921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood#endif 183921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood 1847cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifdef EIGEN_VECTORIZE_SSE2 1857cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// For PacketSize of 4 floats the Packet is not modified 1867cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 1877cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pinterleave4x64<Packet4f>(const Packet4f& from) { 1887cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return from; 1897cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 1907cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 1917cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 4 floats loaded from 4 bfloat16 values 1927cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 1937cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pload4bf16<Packet4f>(const float* from) { 1947cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i zero = _mm_setzero_si128(); 1957cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from)); 1967cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)); 1977cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 1987cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 1997cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 2 floats loaded from 2 bfloat16 values 2007cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 2017cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pload2bf16<Packet4f>(const float* from) { 2027cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i zero = _mm_setzero_si128(); 2037cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i tmp = _mm_castps_si128(_mm_load_ps1(from)); 2047cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)); 2057cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 2067cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 2077cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 4 floats expanded from 4 bfloat16 values 2087cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// in the lower half of the 128-bit lane 2097cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 2107cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_l(const Packet4f& from) { 2117cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i zero = _mm_setzero_si128(); 2127cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i tmp = _mm_castps_si128(from); 2137cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)); 2147cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 2157cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 2167cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 4 floats expanded from 4 bfloat16 values 2177cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// in the upper half of the 128-bit lane 2187cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 2197cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_u(const Packet4f& from) { 2207cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i zero = _mm_setzero_si128(); 2217cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i tmp = _mm_castps_si128(from); 2227cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm_castsi128_ps(_mm_unpackhi_epi16(zero, tmp)); 2237cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 2247cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 2257cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the first value of the input Packet replicated 2267cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 2277cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pbroadcast_first<Packet4f>(const Packet4f& a) { 2287cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm_set1_ps(pfirst<Packet4f>(a)); 2297cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 2307cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 2317cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the second value of the input Packet replicated 2327cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 2337cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pbroadcast_second<Packet4f>(const Packet4f& a) { 2347cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm_set1_ps(_mm_cvtss_f32(_mm_shuffle_ps(a, a, 1))); 2357cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 2367cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 2377cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the third value of the input Packet replicated 2387cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 2397cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pbroadcast_third<Packet4f>(const Packet4f& a) { 2407cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm_set1_ps(_mm_cvtss_f32(_mm_shuffle_ps(a, a, 2))); 2417cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 2427cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 2437cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the fourth value of the input Packet replicated 2447cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 2457cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) { 2467cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm_set1_ps(_mm_cvtss_f32(_mm_shuffle_ps(a, a, 3))); 2477cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 2487cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 2497cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif 2507cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 25135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX512 25235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 25335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f 25435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_first<Packet16f>(const Packet16f& a_in) { 25535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower Packet4f a = _mm512_castps512_ps128(a_in); 25635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastss_ps(a); 25735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 25835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 25935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f 26035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_second<Packet16f>(const Packet16f& a_in) { 26135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower Packet4f a = _mm512_castps512_ps128(a_in); 26235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); 26335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 26435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 26535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f 26635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_third<Packet16f>(const Packet16f& a_in) { 26735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower Packet4f a = _mm512_castps512_ps128(a_in); 26835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); 26935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 27035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 27135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f 27235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_fourth<Packet16f>(const Packet16f& a_in) { 27335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower Packet4f a = _mm512_castps512_ps128(a_in); 27435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3))); 27535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 27635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 27735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8d pbroadcast_first<Packet8d>(const Packet8d& a_in) { 27835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower Packet2d a = _mm512_castpd512_pd128(a_in); 27935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastsd_pd(a); 28035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 28135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 28235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8d pbroadcast_second<Packet8d>(const Packet8d& a_in) { 28335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower Packet2d a = _mm_permute_pd(_mm512_castpd512_pd128(a_in), 3); 28435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastsd_pd(a); 28535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 28635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 28735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8d pbroadcast_third<Packet8d>(const Packet8d& a_in) { 28823caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner Packet2d a = _mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1); 28935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastsd_pd(a); 29035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 29135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 29235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8d pbroadcast_fourth<Packet8d>(const Packet8d& a_in) { 29323caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner Packet2d a = 29423caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner _mm_permute_pd(_mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1), 3); 29535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastsd_pd(a); 29635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 29735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 29835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16i 29935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_first<Packet16i>(const Packet16i& a_in) { 30035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower Packet4i a = _mm512_castsi512_si128(a_in); 30135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastd_epi32(a); 30235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 30335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 30435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16i 30535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_second<Packet16i>(const Packet16i& a_in) { 30635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower Packet4i a = _mm512_castsi512_si128(a_in); 30735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1))); 30835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 30935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 31035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16i 31135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_third<Packet16i>(const Packet16i& a_in) { 31235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower Packet4i a = _mm512_castsi512_si128(a_in); 31335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2))); 31435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 31535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 31635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16i 31735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_fourth<Packet16i>(const Packet16i& a_in) { 31835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower Packet4i a = _mm512_castsi512_si128(a_in); 31935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3))); 32035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 32135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#endif 32235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower 3237cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX 3247cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// For a Packet of Size 8 floats(256-bits), swap the 2nd and 3rd quadwords 3257cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 3267cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pinterleave4x64<Packet8f>(const Packet8f& from) { 3277cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX2 3287cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(from), 3297cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower _MM_SHUFFLE(3, 1, 2, 0))); 3307cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#else 331db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower auto tmp1 = _mm256_extract_epi32(_mm256_castps_si256(from), 2); 332db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower auto tmp2 = _mm256_extract_epi32(_mm256_castps_si256(from), 3); 333db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower auto tmp3 = _mm256_extract_epi32(_mm256_castps_si256(from), 4); 334db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower auto tmp4 = _mm256_extract_epi32(_mm256_castps_si256(from), 5); 335db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower auto tmp5 = _mm256_insert_epi32(_mm256_castps_si256(from), tmp1, 4); 336db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower tmp5 = _mm256_insert_epi32(tmp5, tmp2, 5); 337db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower tmp5 = _mm256_insert_epi32(tmp5, tmp3, 2); 338db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower tmp5 = _mm256_insert_epi32(tmp5, tmp4, 3); 339db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower return _mm256_castsi256_ps(tmp5); 3407cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif 3417cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 3427cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 4 floats loaded from 4 bfloat16 values 3437cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 3447cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pload4bf16<Packet8f>(const float* from) { 3457cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i zero = _mm_setzero_si128(); 3467cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from)); 3477cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm256_castps128_ps256( 3487cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp))); 3497cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 3507cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 2 floats loaded from 2 bfloat16 values 3517cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 3527cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pload2bf16<Packet8f>(const float* from) { 3537cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i zero = _mm_setzero_si128(); 3547cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i tmp = _mm_castps_si128(_mm_load_ps1(from)); 3557cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm256_castps128_ps256( 3567cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp))); 3577cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 3587cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 35935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX512 36035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower// Return a Packet with 4 floats loaded from 4 bfloat16 values 36135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 36235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f pload4bf16<Packet16f>(const float* from) { 36335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower __m128i zero = _mm_setzero_si128(); 36435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower __m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from)); 36535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_castps128_ps512( 36635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp))); 36735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 36835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower// Return a Packet with 2 floats loaded from 2 bfloat16 values 36935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <> 37035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f pload2bf16<Packet16f>(const float* from) { 37135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower __m128i zero = _mm_setzero_si128(); 37235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower __m128i tmp = _mm_castps_si128(_mm_load_ps1(from)); 37335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower return _mm512_castps128_ps512( 37435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp))); 37535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 37635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#endif 37735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower 3787cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// For each 128-bit lane convert 4 bfloat to 4 float values from the lower half 3797cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// of the 128-bit lane 3807cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 3817cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet8f pexpand_bf16_l(const Packet8f& from) { 3827cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX2 3837cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m256i zero = _mm256_setzero_si256(); 3847cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m256i tmp = _mm256_castps_si256(from); 3857cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm256_castsi256_ps(_mm256_unpacklo_epi16(zero, tmp)); 3867cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#else 3877cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i zero = _mm_setzero_si128(); 3887cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i low = _mm_castps_si128(_mm256_extractf128_ps(from, 0)); 3897cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i res_l = _mm_unpacklo_epi16(zero, low); 3907cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i high = _mm_castps_si128(_mm256_extractf128_ps(from, 1)); 3917cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i res_h = _mm_unpacklo_epi16(zero, high); 3927cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m256 res = _mm256_castps128_ps256(_mm_castsi128_ps(res_l)); 3937cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower res = _mm256_insertf128_ps(res, _mm_castsi128_ps(res_h), 1); 3947cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return res; 3957cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif 3967cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 3977cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 3987cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// For each 128-bit lane convert 4 bfloat to 4 float values from the upper half 3997cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// of the 128-bit lane 4007cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet> 4017cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet8f pexpand_bf16_u(const Packet8f& from) { 4027cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX2 4037cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m256i zero = _mm256_setzero_si256(); 4047cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m256i tmp = _mm256_castps_si256(from); 4057cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm256_castsi256_ps(_mm256_unpackhi_epi16(zero, tmp)); 4067cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#else 4077cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i zero = _mm_setzero_si128(); 4087cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i low = _mm_castps_si128(_mm256_extractf128_ps(from, 0)); 4097cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i res_l = _mm_unpackhi_epi16(zero, low); 4107cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i high = _mm_castps_si128(_mm256_extractf128_ps(from, 1)); 4117cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m128i res_h = _mm_unpackhi_epi16(zero, high); 4127cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower __m256 res = _mm256_castps128_ps256(_mm_castsi128_ps(res_l)); 4137cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower res = _mm256_insertf128_ps(res, _mm_castsi128_ps(res_h), 1); 4147cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return res; 4157cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif 4167cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 4177cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 4187cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the first value of the input Packet replicated 4197cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 4207cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pbroadcast_first<Packet8f>(const Packet8f& a) { 4217cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm256_set1_ps(pfirst<Packet8f>(a)); 4227cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 4237cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 4247cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the second value of the input Packet replicated 4257cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 4267cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pbroadcast_second<Packet8f>(const Packet8f& a) { 4277cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm256_set1_ps( 4287cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 1)))); 4297cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 4307cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 4317cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the third value of the input Packet replicated 4327cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 4337cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pbroadcast_third<Packet8f>(const Packet8f& a) { 4347cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm256_set1_ps( 4357cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 2)))); 4367cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 4377cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 4387cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the fourth value of the input Packet replicated 4397cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <> 4407cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pbroadcast_fourth<Packet8f>(const Packet8f& a) { 4417cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower return _mm256_set1_ps( 4427cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 3)))); 4437cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} 4447cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower 4457cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif 44635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower 44735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX512 44835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower 44935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <typename Packet> 45035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) { 45123caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner return _mm512_castsi512_ps(_mm512_slli_epi32( 45223caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner _mm512_cvtepu16_epi32(_mm512_castsi512_si256(_mm512_castps_si512(from))), 45323caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner 16)); 45435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 45535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower 45635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <typename Packet> 45735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) { 45823caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner Packet16i tmp = _mm512_castps_si512(from); 45923caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner Packet16i tmp2 = _mm512_alignr_epi32(tmp, tmp, 8); 46023caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner return _mm512_castsi512_ps(_mm512_slli_epi32( 46123caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner _mm512_cvtepu16_epi32(_mm512_castsi512_si256(tmp2)), 16)); 46235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower} 46335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower 46435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#endif 4657cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} // namespace internal 4667cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower} // namespace Eigen 4677cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif 468