1c8b59c046895fa5b6d79f73e0b5817330fcfbfc1A. Unique TensorFlower/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
27cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
37cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerLicensed under the Apache License, Version 2.0 (the "License");
47cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFloweryou may not use this file except in compliance with the License.
57cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerYou may obtain a copy of the License at
67cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
77cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower    http://www.apache.org/licenses/LICENSE-2.0
87cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
97cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerUnless required by applicable law or agreed to in writing, software
107cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerdistributed under the License is distributed on an "AS IS" BASIS,
117cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
127cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerSee the License for the specific language governing permissions and
137cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerlimitations under the License.
147cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower==============================================================================*/
157cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
167cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifndef TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_
177cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#define TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_
187cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
#include <cstring>

#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/platform/types.h"
217cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
22818993c7751601527d662d2417f220e4e856e4efVijay Vasudevan#if defined(PLATFORM_WINDOWS)
231e44b15ff02fe3bf4764189eb1b796602d669c3enamrata-ibm#include "tensorflow/core/platform/windows/cpu_info.h"
24818993c7751601527d662d2417f220e4e856e4efVijay Vasudevan#include "tensorflow/core/platform/windows/intrinsics_port.h"
25818993c7751601527d662d2417f220e4e856e4efVijay Vasudevan#endif
26818993c7751601527d662d2417f220e4e856e4efVijay Vasudevan
277cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowernamespace Eigen {
287cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowernamespace internal {
297cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
// Returns the float representation of the bfloat16 value stored in the lower
// 16 bits of `from`: those 16 bits become the upper half of a 32-bit word
// whose lower half is zero, and that word is returned as a float.
//
// "Lower" means the first two bytes in memory, which is why the two byte
// orders pick different halves of the 32-bit word below.
//
// Only the first sizeof(tensorflow::uint32) bytes of `from` are read and the
// result is produced as a float that is then converted to Packet, so this
// generic version is intended for scalar packets.
//
// The bits are moved with std::memcpy instead of casting references: reading
// a float object through a uint32 lvalue (or the reverse) violates the
// strict-aliasing rule and is undefined behaviour.
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet pexpand_bf16_l(const Packet& from) {
  tensorflow::uint32 bits;
  std::memcpy(&bits, &from, sizeof(bits));
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  // Big-endian: the wanted half already occupies the upper 16 bits.
  bits &= 0xffff0000;
#else
  // Little-endian: move the low 16 bits into the upper half.
  bits = (bits << 16) & 0xffff0000;
#endif
  float expanded;
  std::memcpy(&expanded, &bits, sizeof(expanded));
  return expanded;
}
427cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
// Returns the float representation of the bfloat16 value stored in the upper
// 16 bits of `from`: those 16 bits become the upper half of a 32-bit word
// whose lower half is zero, and that word is returned as a float.
//
// "Upper" means the third and fourth bytes in memory, which is why the two
// byte orders pick different halves of the 32-bit word below.
//
// Only the first sizeof(tensorflow::uint32) bytes of `from` are read and the
// result is produced as a float that is then converted to Packet, so this
// generic version is intended for scalar packets.
//
// The bits are moved with std::memcpy instead of casting references: reading
// a float object through a uint32 lvalue (or the reverse) violates the
// strict-aliasing rule and is undefined behaviour.
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet pexpand_bf16_u(const Packet& from) {
  tensorflow::uint32 bits;
  std::memcpy(&bits, &from, sizeof(bits));
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  // Big-endian: the wanted half sits in the low 16 bits, so shift it up.
  bits = (bits << 16) & 0xffff0000;
#else
  // Little-endian: the wanted half already occupies the upper 16 bits.
  bits &= 0xffff0000;
#endif
  float expanded;
  std::memcpy(&expanded, &bits, sizeof(expanded));
  return expanded;
}
557cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
// Packet4f (non-scalar) versions of pexpand_bf16_l/u for non-SSE targets:
// AltiVec, VSX, NEON, and ZVECTOR (vectorization on z13 and higher).
58579438b835aafc377bba01ef6c50beca896a56bbBenoit Steiner#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) || \
5996e9e8e8fc8d027021225fc8bd665968859c8d01namrata-ibm    defined(EIGEN_VECTORIZE_NEON) || defined(EIGEN_VECTORIZE_ZVECTOR)
6067324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowertemplate <typename Packet>
6167324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_l(const Packet4f& from) {
6267324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  float r[4];
6367324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  tensorflow::uint32 p[4];
6467324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  pstoreu(r, from);
6523caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  tensorflow::uint32* ir = reinterpret_cast<tensorflow::uint32*>(r);
6667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[0] = (ir[0] << 16) & 0xffff0000;
6723caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  p[1] = ir[0] & 0xffff0000;
6867324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[2] = (ir[1] << 16) & 0xffff0000;
6967324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[3] = ir[1] & 0xffff0000;
7023caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  return ploadu<Packet4f>(reinterpret_cast<float*>(p));
7167324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower}
7267324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower
7367324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowertemplate <typename Packet>
7467324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_u(const Packet4f& from) {
7567324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  float r[4];
7667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  tensorflow::uint32 p[4];
7767324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  pstoreu(r, from);
7823caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  tensorflow::uint32* ir = reinterpret_cast<tensorflow::uint32*>(r);
7967324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[0] = (ir[2] << 16) & 0xffff0000;
8067324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[1] = ir[2] & 0xffff0000;
8167324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[2] = (ir[3] << 16) & 0xffff0000;
8267324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[3] = ir[3] & 0xffff0000;
8323caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  return ploadu<Packet4f>(reinterpret_cast<float*>(p));
8467324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower}
8567324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower#endif
8667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower
877cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
887cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pinterleave4x64(const Packet& from) {
897cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return from;
907cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
917cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
927cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
937cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pbroadcast_first(const Packet& a) {
947cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return a;
957cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
967cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
977cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
987cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pbroadcast_second(const Packet& a) {
997cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  assert(false && "Not applicable to Scalar Values");
1007cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return a;
1017cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
1027cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
1037cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
1047cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pbroadcast_third(const Packet& a) {
1057cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  assert(false && "Not applicable to Scalar Values");
1067cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return a;
1077cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
1087cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
1097cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
1107cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pbroadcast_fourth(const Packet& a) {
1117cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  assert(false && "Not applicable to Scalar Values");
1127cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return a;
1137cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
1147cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
1157cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
1167cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pload4bf16(
1177cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower    const typename unpacket_traits<Packet>::type* from) {
1187cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  assert(false && "Not applicable to Scalar Values");
11967324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  return Packet();
1207cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
1217cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
1227cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
1237cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet pload2bf16(
1247cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower    const typename unpacket_traits<Packet>::type* from) {
1257cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  assert(false && "Not applicable to Scalar Values");
12667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  return Packet();
12767324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower}
12867324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower
// Specializations of pload4bf16 and pload2bf16 for non-SSE targets:
// AltiVec, VSX, NEON, and ZVECTOR (vectorization on z13 and higher).
131579438b835aafc377bba01ef6c50beca896a56bbBenoit Steiner#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) || \
13296e9e8e8fc8d027021225fc8bd665968859c8d01namrata-ibm    defined(EIGEN_VECTORIZE_NEON) || defined(EIGEN_VECTORIZE_ZVECTOR)
13367324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowertemplate <>
13467324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pload4bf16<Packet4f>(const float* from) {
13567324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  tensorflow::uint32 p[4];
13623caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  const tensorflow::uint32* ir =
13723caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner      reinterpret_cast<const tensorflow::uint32*>(from);
13867324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[0] = (ir[0] << 16) & 0xffff0000;
13923caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  p[1] = ir[0] & 0xffff0000;
14067324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[2] = (ir[1] << 16) & 0xffff0000;
14167324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[3] = ir[1] & 0xffff0000;
14223caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  return ploadu<Packet4f>(reinterpret_cast<float*>(p));
14367324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower}
14467324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower
14567324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowertemplate <>
14667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pload2bf16<Packet4f>(const float* from) {
14767324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  tensorflow::uint32 p[4];
14823caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  const tensorflow::uint32* ir =
14923caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner      reinterpret_cast<const tensorflow::uint32*>(from);
15067324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[0] = (ir[0] << 16) & 0xffff0000;
15123caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  p[1] = ir[0] & 0xffff0000;
15267324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[2] = (ir[0] << 16) & 0xffff0000;
15367324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower  p[3] = ir[0] & 0xffff0000;
15423caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  return ploadu<Packet4f>(reinterpret_cast<float*>(p));
1557cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
15667324b1e3af826c4c491802f4022a5f5be9f6670A. Unique TensorFlower#endif
1577cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
158921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
159921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood// Return a packet with the first value of the input Packet replicated
160921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Soodtemplate <>
161921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav SoodEIGEN_STRONG_INLINE Packet4f pbroadcast_first<Packet4f>(const Packet4f& a) {
162982549ea3423df4270ff154e5c764beb43d472daRasmus Munk Larsen  return vec_splat(a, 0);
163921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood}
164921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood
165921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood// Return a packet with the second value of the input Packet replicated
166921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Soodtemplate <>
167921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav SoodEIGEN_STRONG_INLINE Packet4f pbroadcast_second<Packet4f>(const Packet4f& a) {
168982549ea3423df4270ff154e5c764beb43d472daRasmus Munk Larsen  return vec_splat(a, 1);
169921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood}
170921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood
171921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood// Return a packet with the third value of the input Packet replicated
172921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Soodtemplate <>
173921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav SoodEIGEN_STRONG_INLINE Packet4f pbroadcast_third<Packet4f>(const Packet4f& a) {
174982549ea3423df4270ff154e5c764beb43d472daRasmus Munk Larsen  return vec_splat(a, 2);
175921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood}
176921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood
177921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood// Return a packet with the fourth value of the input Packet replicated
178921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Soodtemplate <>
179921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav SoodEIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
180982549ea3423df4270ff154e5c764beb43d472daRasmus Munk Larsen  return vec_splat(a, 3);
181921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood}
182921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood#endif
183921329b4a9b2fbf4c6904121aef3d09398febe45Vaibhav Sood
1847cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifdef EIGEN_VECTORIZE_SSE2
1857cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// For PacketSize of 4 floats the Packet is not modified
1867cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
1877cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pinterleave4x64<Packet4f>(const Packet4f& from) {
1887cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return from;
1897cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
1907cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
1917cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 4 floats loaded from 4 bfloat16 values
1927cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
1937cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pload4bf16<Packet4f>(const float* from) {
1947cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i zero = _mm_setzero_si128();
1957cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from));
1967cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp));
1977cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
1987cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
1997cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 2 floats loaded from 2 bfloat16 values
2007cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
2017cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pload2bf16<Packet4f>(const float* from) {
2027cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i zero = _mm_setzero_si128();
2037cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i tmp = _mm_castps_si128(_mm_load_ps1(from));
2047cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp));
2057cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
2067cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
2077cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 4 floats expanded from 4 bfloat16 values
2087cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// in the lower half of the 128-bit lane
2097cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
2107cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_l(const Packet4f& from) {
2117cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i zero = _mm_setzero_si128();
2127cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i tmp = _mm_castps_si128(from);
2137cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp));
2147cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
2157cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
2167cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 4 floats expanded from 4 bfloat16 values
2177cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// in the upper half of the 128-bit lane
2187cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
2197cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_u(const Packet4f& from) {
2207cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i zero = _mm_setzero_si128();
2217cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i tmp = _mm_castps_si128(from);
2227cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm_castsi128_ps(_mm_unpackhi_epi16(zero, tmp));
2237cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
2247cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
2257cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the first value of the input Packet replicated
2267cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
2277cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pbroadcast_first<Packet4f>(const Packet4f& a) {
2287cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm_set1_ps(pfirst<Packet4f>(a));
2297cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
2307cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
2317cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the second value of the input Packet replicated
2327cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
2337cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pbroadcast_second<Packet4f>(const Packet4f& a) {
2347cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm_set1_ps(_mm_cvtss_f32(_mm_shuffle_ps(a, a, 1)));
2357cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
2367cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
2377cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the third value of the input Packet replicated
2387cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
2397cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pbroadcast_third<Packet4f>(const Packet4f& a) {
2407cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm_set1_ps(_mm_cvtss_f32(_mm_shuffle_ps(a, a, 2)));
2417cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
2427cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
2437cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the fourth value of the input Packet replicated
2447cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
2457cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
2467cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm_set1_ps(_mm_cvtss_f32(_mm_shuffle_ps(a, a, 3)));
2477cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
2487cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
2497cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif
2507cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
25135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX512
25235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
25335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f
25435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_first<Packet16f>(const Packet16f& a_in) {
25535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  Packet4f a = _mm512_castps512_ps128(a_in);
25635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastss_ps(a);
25735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
25835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
25935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f
26035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_second<Packet16f>(const Packet16f& a_in) {
26135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  Packet4f a = _mm512_castps512_ps128(a_in);
26235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
26335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
26435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
26535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f
26635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_third<Packet16f>(const Packet16f& a_in) {
26735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  Packet4f a = _mm512_castps512_ps128(a_in);
26835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));
26935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
27035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
27135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f
27235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_fourth<Packet16f>(const Packet16f& a_in) {
27335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  Packet4f a = _mm512_castps512_ps128(a_in);
27435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)));
27535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
27635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
27735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8d pbroadcast_first<Packet8d>(const Packet8d& a_in) {
27835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  Packet2d a = _mm512_castpd512_pd128(a_in);
27935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastsd_pd(a);
28035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
28135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
28235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8d pbroadcast_second<Packet8d>(const Packet8d& a_in) {
28335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  Packet2d a = _mm_permute_pd(_mm512_castpd512_pd128(a_in), 3);
28435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastsd_pd(a);
28535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
28635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
28735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8d pbroadcast_third<Packet8d>(const Packet8d& a_in) {
28823caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  Packet2d a = _mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1);
28935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastsd_pd(a);
29035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
29135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
29235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8d pbroadcast_fourth<Packet8d>(const Packet8d& a_in) {
29323caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  Packet2d a =
29423caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner      _mm_permute_pd(_mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1), 3);
29535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastsd_pd(a);
29635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
29735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
29835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16i
29935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_first<Packet16i>(const Packet16i& a_in) {
30035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  Packet4i a = _mm512_castsi512_si128(a_in);
30135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastd_epi32(a);
30235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
30335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
30435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16i
30535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_second<Packet16i>(const Packet16i& a_in) {
30635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  Packet4i a = _mm512_castsi512_si128(a_in);
30735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1)));
30835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
30935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
31035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16i
31135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_third<Packet16i>(const Packet16i& a_in) {
31235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  Packet4i a = _mm512_castsi512_si128(a_in);
31335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2)));
31435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
31535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
31635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16i
31735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerpbroadcast_fourth<Packet16i>(const Packet16i& a_in) {
31835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  Packet4i a = _mm512_castsi512_si128(a_in);
31935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3)));
32035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
32135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#endif
32235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower
3237cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX
3247cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// For a Packet of Size 8 floats(256-bits), swap the 2nd and 3rd quadwords
3257cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
3267cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pinterleave4x64<Packet8f>(const Packet8f& from) {
3277cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX2
3287cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(from),
3297cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower                                                      _MM_SHUFFLE(3, 1, 2, 0)));
3307cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#else
331db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower  auto tmp1 = _mm256_extract_epi32(_mm256_castps_si256(from), 2);
332db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower  auto tmp2 = _mm256_extract_epi32(_mm256_castps_si256(from), 3);
333db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower  auto tmp3 = _mm256_extract_epi32(_mm256_castps_si256(from), 4);
334db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower  auto tmp4 = _mm256_extract_epi32(_mm256_castps_si256(from), 5);
335db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower  auto tmp5 = _mm256_insert_epi32(_mm256_castps_si256(from), tmp1, 4);
336db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower  tmp5 = _mm256_insert_epi32(tmp5, tmp2, 5);
337db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower  tmp5 = _mm256_insert_epi32(tmp5, tmp3, 2);
338db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower  tmp5 = _mm256_insert_epi32(tmp5, tmp4, 3);
339db769cc4f04bac93221cdbe16cf4ed2e9785163eA. Unique TensorFlower  return _mm256_castsi256_ps(tmp5);
3407cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif
3417cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
3427cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 4 floats loaded from 4 bfloat16 values
3437cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
3447cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pload4bf16<Packet8f>(const float* from) {
3457cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i zero = _mm_setzero_si128();
3467cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from));
3477cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm256_castps128_ps256(
3487cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower      _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
3497cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
3507cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a Packet with 2 floats loaded from 2 bfloat16 values
3517cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
3527cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pload2bf16<Packet8f>(const float* from) {
3537cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i zero = _mm_setzero_si128();
3547cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i tmp = _mm_castps_si128(_mm_load_ps1(from));
3557cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm256_castps128_ps256(
3567cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower      _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
3577cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
3587cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
35935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX512
36035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower// Return a Packet with 4 floats loaded from 4 bfloat16 values
36135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
36235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f pload4bf16<Packet16f>(const float* from) {
36335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  __m128i zero = _mm_setzero_si128();
36435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  __m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from));
36535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_castps128_ps512(
36635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower      _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
36735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
36835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower// Return a Packet with 2 floats loaded from 2 bfloat16 values
36935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <>
37035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_STRONG_INLINE Packet16f pload2bf16<Packet16f>(const float* from) {
37135b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  __m128i zero = _mm_setzero_si128();
37235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  __m128i tmp = _mm_castps_si128(_mm_load_ps1(from));
37335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower  return _mm512_castps128_ps512(
37435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower      _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
37535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
37635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#endif
37735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower
3787cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// For each 128-bit lane convert 4 bfloat to 4 float values from the lower half
3797cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// of the 128-bit lane
3807cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
3817cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet8f pexpand_bf16_l(const Packet8f& from) {
3827cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX2
3837cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m256i zero = _mm256_setzero_si256();
3847cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m256i tmp = _mm256_castps_si256(from);
3857cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm256_castsi256_ps(_mm256_unpacklo_epi16(zero, tmp));
3867cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#else
3877cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i zero = _mm_setzero_si128();
3887cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i low = _mm_castps_si128(_mm256_extractf128_ps(from, 0));
3897cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i res_l = _mm_unpacklo_epi16(zero, low);
3907cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i high = _mm_castps_si128(_mm256_extractf128_ps(from, 1));
3917cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i res_h = _mm_unpacklo_epi16(zero, high);
3927cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m256 res = _mm256_castps128_ps256(_mm_castsi128_ps(res_l));
3937cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  res = _mm256_insertf128_ps(res, _mm_castsi128_ps(res_h), 1);
3947cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return res;
3957cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif
3967cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
3977cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
3987cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// For each 128-bit lane convert 4 bfloat to 4 float values from the upper half
3997cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// of the 128-bit lane
4007cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <typename Packet>
4017cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet8f pexpand_bf16_u(const Packet8f& from) {
4027cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX2
4037cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m256i zero = _mm256_setzero_si256();
4047cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m256i tmp = _mm256_castps_si256(from);
4057cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm256_castsi256_ps(_mm256_unpackhi_epi16(zero, tmp));
4067cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#else
4077cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i zero = _mm_setzero_si128();
4087cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i low = _mm_castps_si128(_mm256_extractf128_ps(from, 0));
4097cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i res_l = _mm_unpackhi_epi16(zero, low);
4107cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i high = _mm_castps_si128(_mm256_extractf128_ps(from, 1));
4117cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m128i res_h = _mm_unpackhi_epi16(zero, high);
4127cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  __m256 res = _mm256_castps128_ps256(_mm_castsi128_ps(res_l));
4137cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  res = _mm256_insertf128_ps(res, _mm_castsi128_ps(res_h), 1);
4147cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return res;
4157cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif
4167cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
4177cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
4187cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the first value of the input Packet replicated
4197cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
4207cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pbroadcast_first<Packet8f>(const Packet8f& a) {
4217cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm256_set1_ps(pfirst<Packet8f>(a));
4227cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
4237cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
4247cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the second value of the input Packet replicated
4257cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
4267cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pbroadcast_second<Packet8f>(const Packet8f& a) {
4277cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm256_set1_ps(
4287cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower      _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 1))));
4297cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
4307cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
4317cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the third value of the input Packet replicated
4327cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
4337cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pbroadcast_third<Packet8f>(const Packet8f& a) {
4347cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm256_set1_ps(
4357cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower      _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 2))));
4367cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
4377cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
4387cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower// Return a packet with the fourth value of the input Packet replicated
4397cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowertemplate <>
4407cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlowerEIGEN_STRONG_INLINE Packet8f pbroadcast_fourth<Packet8f>(const Packet8f& a) {
4417cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower  return _mm256_set1_ps(
4427cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower      _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 3))));
4437cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}
4447cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower
4457cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif
44635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower
44735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#ifdef EIGEN_VECTORIZE_AVX512
44835b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower
44935b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <typename Packet>
45035b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) {
45123caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  return _mm512_castsi512_ps(_mm512_slli_epi32(
45223caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner      _mm512_cvtepu16_epi32(_mm512_castsi512_si256(_mm512_castps_si512(from))),
45323caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner      16));
45435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
45535b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower
45635b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowertemplate <typename Packet>
45735b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlowerEIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) {
45823caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  Packet16i tmp = _mm512_castps_si512(from);
45923caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  Packet16i tmp2 = _mm512_alignr_epi32(tmp, tmp, 8);
46023caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner  return _mm512_castsi512_ps(_mm512_slli_epi32(
46123caaa5e42c87a189511438dcadc428b683cd028Benoit Steiner      _mm512_cvtepu16_epi32(_mm512_castsi512_si256(tmp2)), 16));
46235b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower}
46335b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower
46435b6050b574e6b4c4cecf8db2a0c37e48d43b9eaA. Unique TensorFlower#endif
4657cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}  // namespace internal
4667cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower}  // namespace Eigen
4677cddb41a31650ec57cc2110bfd9f2a8eb9b42613A. Unique TensorFlower#endif
468