1// This file is part of Eigen, a lightweight C++ template library 2// for linear algebra. 3// 4// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org> 5// 6// This Source Code Form is subject to the terms of the Mozilla 7// Public License v. 2.0. If a copy of the MPL was not distributed 8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10#ifndef EIGEN_PACKET_MATH_ALTIVEC_H 11#define EIGEN_PACKET_MATH_ALTIVEC_H 12 13namespace Eigen { 14 15namespace internal { 16 17#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 18#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 19#endif 20 21#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 22#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 23#endif 24 25#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD 26#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD 27#endif 28 29// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 30#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 31#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 32#endif 33 34typedef __vector float Packet4f; 35typedef __vector int Packet4i; 36typedef __vector unsigned int Packet4ui; 37typedef __vector __bool int Packet4bi; 38typedef __vector short int Packet8i; 39typedef __vector unsigned char Packet16uc; 40 41// We don't want to write the same code all the time, but we need to reuse the constants 42// and it doesn't really work to declare them global, so we define macros instead 43 44#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ 45 Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X)) 46 47#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ 48 Packet4i p4i_##NAME = vec_splat_s32(X) 49 50#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ 51 Packet4f p4f_##NAME = pset1<Packet4f>(X) 52 53#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ 54 Packet4i p4i_##NAME = pset1<Packet4i>(X) 55 56#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ 57 Packet2d p2d_##NAME = pset1<Packet2d>(X) 58 59#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ 60 Packet2l p2l_##NAME = pset1<Packet2l>(X) 61 62#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ 63 const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X)) 64 65#define DST_CHAN 1 66#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) 67 68 69// These constants are endian-agnostic 70static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} 71static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} 72static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} 73static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} 74static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} 75static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} 76#ifndef __VSX__ 77static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} 78#endif 79 80static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; 81static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; 82 83static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; 84static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; 85 86// Mask alignment 87#ifdef __PPC64__ 88#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 89#else 90#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 91#endif 92 93#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) 94 95// Handle endianness properly while loading constants 96// Define global static constants: 97#ifdef _BIG_ENDIAN 98static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 99#ifdef __VSX__ 100static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 101#endif 102static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 103static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; 104static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; 105#else 106static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 107static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 108static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 109static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; 110static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; 111#endif // _BIG_ENDIAN 112 113static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; 114static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; 115static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; 116static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; 117 118static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; 119 120#ifdef _BIG_ENDIAN 121static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 122#else 123static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 124#endif // _BIG_ENDIAN 125 126#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC 127 #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); 128#else 129 #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); 130#endif 131 132template<> struct packet_traits<float> : default_packet_traits 133{ 134 typedef Packet4f type; 135 typedef Packet4f half; 136 enum { 137 Vectorizable = 1, 138 AlignedOnScalar = 1, 139 size=4, 140 HasHalfPacket = 1, 141 142 HasAdd = 1, 143 HasSub = 1, 144 HasMul = 1, 145 HasDiv = 1, 146 HasMin = 1, 147 HasMax = 1, 148 HasAbs = 1, 149 HasSin = 0, 150 HasCos = 0, 151 HasLog = 0, 152 HasExp = 1, 153#ifdef __VSX__ 154 HasSqrt = 1, 155#if !EIGEN_COMP_CLANG 156 HasRsqrt = 1, 157#else 158 HasRsqrt = 0, 159#endif 160#else 161 HasSqrt = 0, 162 HasRsqrt = 0, 163#endif 164 HasRound = 1, 165 HasFloor = 1, 166 HasCeil = 1, 167 HasNegate = 1, 168 HasBlend = 1 169 }; 170}; 171template<> struct packet_traits<int> : default_packet_traits 172{ 173 typedef Packet4i type; 174 typedef Packet4i half; 175 enum { 176 Vectorizable = 1, 177 AlignedOnScalar = 1, 178 size = 4, 179 HasHalfPacket = 0, 180 181 HasAdd = 1, 182 HasSub = 1, 183 HasMul = 1, 184 HasDiv = 0, 185 HasBlend = 1 186 }; 187}; 188 189 190template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; 191template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; 192 193inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) 194{ 195 union { 196 Packet16uc v; 197 unsigned char n[16]; 198 } vt; 199 vt.v = v; 200 for (int i=0; i< 16; i++) 201 s << (int)vt.n[i] << ", "; 202 return s; 203} 204 205inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) 206{ 207 union { 208 Packet4f v; 209 float n[4]; 210 } vt; 211 vt.v = v; 212 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 213 return s; 214} 215 216inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) 217{ 218 union { 219 Packet4i v; 220 int n[4]; 221 } vt; 222 vt.v = v; 223 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 224 return s; 225} 226 227inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) 228{ 229 union { 230 Packet4ui v; 231 unsigned int n[4]; 232 } vt; 233 vt.v = v; 234 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 235 return s; 236} 237 238// Need to define them first or we get specialization after instantiation errors 239template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) 240{ 241 EIGEN_DEBUG_ALIGNED_LOAD 242#ifdef __VSX__ 243 return vec_vsx_ld(0, from); 244#else 245 return vec_ld(0, from); 246#endif 247} 248 249template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) 250{ 251 EIGEN_DEBUG_ALIGNED_LOAD 252#ifdef __VSX__ 253 return vec_vsx_ld(0, from); 254#else 255 return vec_ld(0, from); 256#endif 257} 258 259template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) 260{ 261 EIGEN_DEBUG_ALIGNED_STORE 262#ifdef __VSX__ 263 vec_vsx_st(from, 0, to); 264#else 265 vec_st(from, 0, to); 266#endif 267} 268 269template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) 270{ 271 EIGEN_DEBUG_ALIGNED_STORE 272#ifdef __VSX__ 273 vec_vsx_st(from, 0, to); 274#else 275 vec_st(from, 0, to); 276#endif 277} 278 279template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { 280 Packet4f v = {from, from, from, from}; 281 return v; 282} 283 284template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { 285 Packet4i v = {from, from, from, from}; 286 return v; 287} 288template<> EIGEN_STRONG_INLINE void 289pbroadcast4<Packet4f>(const float *a, 290 Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) 291{ 292 a3 = pload<Packet4f>(a); 293 a0 = vec_splat(a3, 0); 294 a1 = vec_splat(a3, 1); 295 a2 = vec_splat(a3, 2); 296 a3 = vec_splat(a3, 3); 297} 298template<> EIGEN_STRONG_INLINE void 299pbroadcast4<Packet4i>(const int *a, 300 Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) 301{ 302 a3 = pload<Packet4i>(a); 303 a0 = vec_splat(a3, 0); 304 a1 = vec_splat(a3, 1); 305 a2 = vec_splat(a3, 2); 306 a3 = vec_splat(a3, 3); 307} 308 309template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) 310{ 311 float EIGEN_ALIGN16 af[4]; 312 af[0] = from[0*stride]; 313 af[1] = from[1*stride]; 314 af[2] = from[2*stride]; 315 af[3] = from[3*stride]; 316 return pload<Packet4f>(af); 317} 318template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) 319{ 320 int EIGEN_ALIGN16 ai[4]; 321 ai[0] = from[0*stride]; 322 ai[1] = from[1*stride]; 323 ai[2] = from[2*stride]; 324 ai[3] = from[3*stride]; 325 return pload<Packet4i>(ai); 326} 327template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) 328{ 329 float EIGEN_ALIGN16 af[4]; 330 pstore<float>(af, from); 331 to[0*stride] = af[0]; 332 to[1*stride] = af[1]; 333 to[2*stride] = af[2]; 334 to[3*stride] = af[3]; 335} 336template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) 337{ 338 int EIGEN_ALIGN16 ai[4]; 339 pstore<int>((int *)ai, from); 340 to[0*stride] = ai[0]; 341 to[1*stride] = ai[1]; 342 to[2*stride] = ai[2]; 343 to[3*stride] = ai[3]; 344} 345 346template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; } 347template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; } 348 349template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; } 350template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; } 351 352template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; } 353template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; } 354 355template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } 356template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } 357 358template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } 359template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } 360 361template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } 362template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; } 363 364template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) 365{ 366#ifndef __VSX__ // VSX actually provides a div instruction 367 Packet4f t, y_0, y_1; 368 369 // Altivec does not offer a divide instruction, we have to do a reciprocal approximation 370 y_0 = vec_re(b); 371 372 // Do one Newton-Raphson iteration to get the needed accuracy 373 t = vec_nmsub(y_0, b, p4f_ONE); 374 y_1 = vec_madd(y_0, t, y_0); 375 376 return vec_madd(a, y_1, p4f_MZERO); 377#else 378 return vec_div(a, b); 379#endif 380} 381 382template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) 383{ eigen_assert(false && "packet integer division are not supported by AltiVec"); 384 return pset1<Packet4i>(0); 385} 386 387// for some weird raisons, it has to be overloaded for packet of integers 388template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } 389template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } 390 391template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } 392template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } 393 394template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } 395template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } 396 397template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } 398template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } 399 400template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } 401template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } 402 403template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } 404template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } 405 406template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } 407template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } 408 409template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); } 410template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); } 411template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); } 412 413#ifdef _BIG_ENDIAN 414template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 415{ 416 EIGEN_DEBUG_ALIGNED_LOAD 417 Packet16uc MSQ, LSQ; 418 Packet16uc mask; 419 MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 420 LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 421 mask = vec_lvsl(0, from); // create the permute mask 422 return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data 423 424} 425template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 426{ 427 EIGEN_DEBUG_ALIGNED_LOAD 428 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 429 Packet16uc MSQ, LSQ; 430 Packet16uc mask; 431 MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 432 LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 433 mask = vec_lvsl(0, from); // create the permute mask 434 return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data 435} 436#else 437// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX 438template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 439{ 440 EIGEN_DEBUG_UNALIGNED_LOAD 441 return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); 442} 443template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 444{ 445 EIGEN_DEBUG_UNALIGNED_LOAD 446 return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); 447} 448#endif 449 450template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) 451{ 452 Packet4f p; 453 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from); 454 else p = ploadu<Packet4f>(from); 455 return vec_perm(p, p, p16uc_DUPLICATE32_HI); 456} 457template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) 458{ 459 Packet4i p; 460 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from); 461 else p = ploadu<Packet4i>(from); 462 return vec_perm(p, p, p16uc_DUPLICATE32_HI); 463} 464 465#ifdef _BIG_ENDIAN 466template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 467{ 468 EIGEN_DEBUG_UNALIGNED_STORE 469 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 470 // Warning: not thread safe! 471 Packet16uc MSQ, LSQ, edges; 472 Packet16uc edgeAlign, align; 473 474 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 475 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 476 edgeAlign = vec_lvsl(0, to); // permute map to extract edges 477 edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges 478 align = vec_lvsr( 0, to ); // permute map to misalign data 479 MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) 480 LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) 481 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 482 vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 483} 484template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 485{ 486 EIGEN_DEBUG_UNALIGNED_STORE 487 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 488 // Warning: not thread safe! 489 Packet16uc MSQ, LSQ, edges; 490 Packet16uc edgeAlign, align; 491 492 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 493 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 494 edgeAlign = vec_lvsl(0, to); // permute map to extract edges 495 edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges 496 align = vec_lvsr( 0, to ); // permute map to misalign data 497 MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) 498 LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) 499 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 500 vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 501} 502#else 503// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX 504template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 505{ 506 EIGEN_DEBUG_ALIGNED_STORE 507 vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); 508} 509template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 510{ 511 EIGEN_DEBUG_ALIGNED_STORE 512 vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); 513} 514#endif 515 516template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); } 517template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); } 518 519template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } 520template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } 521 522template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) 523{ 524 return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); 525} 526template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) 527{ 528 return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); } 529 530template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } 531template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } 532 533template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) 534{ 535 Packet4f b, sum; 536 b = vec_sld(a, a, 8); 537 sum = a + b; 538 b = vec_sld(sum, sum, 4); 539 sum += b; 540 return pfirst(sum); 541} 542 543template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) 544{ 545 Packet4f v[4], sum[4]; 546 547 // It's easier and faster to transpose then add as columns 548 // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 549 // Do the transpose, first set of moves 550 v[0] = vec_mergeh(vecs[0], vecs[2]); 551 v[1] = vec_mergel(vecs[0], vecs[2]); 552 v[2] = vec_mergeh(vecs[1], vecs[3]); 553 v[3] = vec_mergel(vecs[1], vecs[3]); 554 // Get the resulting vectors 555 sum[0] = vec_mergeh(v[0], v[2]); 556 sum[1] = vec_mergel(v[0], v[2]); 557 sum[2] = vec_mergeh(v[1], v[3]); 558 sum[3] = vec_mergel(v[1], v[3]); 559 560 // Now do the summation: 561 // Lines 0+1 562 sum[0] = sum[0] + sum[1]; 563 // Lines 2+3 564 sum[1] = sum[2] + sum[3]; 565 // Add the results 566 sum[0] = sum[0] + sum[1]; 567 568 return sum[0]; 569} 570 571template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) 572{ 573 Packet4i sum; 574 sum = vec_sums(a, p4i_ZERO); 575#ifdef _BIG_ENDIAN 576 sum = vec_sld(sum, p4i_ZERO, 12); 577#else 578 sum = vec_sld(p4i_ZERO, sum, 4); 579#endif 580 return pfirst(sum); 581} 582 583template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) 584{ 585 Packet4i v[4], sum[4]; 586 587 // It's easier and faster to transpose then add as columns 588 // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 589 // Do the transpose, first set of moves 590 v[0] = vec_mergeh(vecs[0], vecs[2]); 591 v[1] = vec_mergel(vecs[0], vecs[2]); 592 v[2] = vec_mergeh(vecs[1], vecs[3]); 593 v[3] = vec_mergel(vecs[1], vecs[3]); 594 // Get the resulting vectors 595 sum[0] = vec_mergeh(v[0], v[2]); 596 sum[1] = vec_mergel(v[0], v[2]); 597 sum[2] = vec_mergeh(v[1], v[3]); 598 sum[3] = vec_mergel(v[1], v[3]); 599 600 // Now do the summation: 601 // Lines 0+1 602 sum[0] = sum[0] + sum[1]; 603 // Lines 2+3 604 sum[1] = sum[2] + sum[3]; 605 // Add the results 606 sum[0] = sum[0] + sum[1]; 607 608 return sum[0]; 609} 610 611// Other reduction functions: 612// mul 613template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) 614{ 615 Packet4f prod; 616 prod = pmul(a, vec_sld(a, a, 8)); 617 return pfirst(pmul(prod, vec_sld(prod, prod, 4))); 618} 619 620template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) 621{ 622 EIGEN_ALIGN16 int aux[4]; 623 pstore(aux, a); 624 return aux[0] * aux[1] * aux[2] * aux[3]; 625} 626 627// min 628template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) 629{ 630 Packet4f b, res; 631 b = vec_min(a, vec_sld(a, a, 8)); 632 res = vec_min(b, vec_sld(b, b, 4)); 633 return pfirst(res); 634} 635 636template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) 637{ 638 Packet4i b, res; 639 b = vec_min(a, vec_sld(a, a, 8)); 640 res = vec_min(b, vec_sld(b, b, 4)); 641 return pfirst(res); 642} 643 644// max 645template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) 646{ 647 Packet4f b, res; 648 b = vec_max(a, vec_sld(a, a, 8)); 649 res = vec_max(b, vec_sld(b, b, 4)); 650 return pfirst(res); 651} 652 653template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) 654{ 655 Packet4i b, res; 656 b = vec_max(a, vec_sld(a, a, 8)); 657 res = vec_max(b, vec_sld(b, b, 4)); 658 return pfirst(res); 659} 660 661template<int Offset> 662struct palign_impl<Offset,Packet4f> 663{ 664 static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) 665 { 666#ifdef _BIG_ENDIAN 667 switch (Offset % 4) { 668 case 1: 669 first = vec_sld(first, second, 4); break; 670 case 2: 671 first = vec_sld(first, second, 8); break; 672 case 3: 673 first = vec_sld(first, second, 12); break; 674 } 675#else 676 switch (Offset % 4) { 677 case 1: 678 first = vec_sld(second, first, 12); break; 679 case 2: 680 first = vec_sld(second, first, 8); break; 681 case 3: 682 first = vec_sld(second, first, 4); break; 683 } 684#endif 685 } 686}; 687 688template<int Offset> 689struct palign_impl<Offset,Packet4i> 690{ 691 static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) 692 { 693#ifdef _BIG_ENDIAN 694 switch (Offset % 4) { 695 case 1: 696 first = vec_sld(first, second, 4); break; 697 case 2: 698 first = vec_sld(first, second, 8); break; 699 case 3: 700 first = vec_sld(first, second, 12); break; 701 } 702#else 703 switch (Offset % 4) { 704 case 1: 705 first = vec_sld(second, first, 12); break; 706 case 2: 707 first = vec_sld(second, first, 8); break; 708 case 3: 709 first = vec_sld(second, first, 4); break; 710 } 711#endif 712 } 713}; 714 715EIGEN_DEVICE_FUNC inline void 716ptranspose(PacketBlock<Packet4f,4>& kernel) { 717 Packet4f t0, t1, t2, t3; 718 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 719 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 720 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 721 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 722 kernel.packet[0] = vec_mergeh(t0, t2); 723 kernel.packet[1] = vec_mergel(t0, t2); 724 kernel.packet[2] = vec_mergeh(t1, t3); 725 kernel.packet[3] = vec_mergel(t1, t3); 726} 727 728EIGEN_DEVICE_FUNC inline void 729ptranspose(PacketBlock<Packet4i,4>& kernel) { 730 Packet4i t0, t1, t2, t3; 731 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 732 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 733 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 734 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 735 kernel.packet[0] = vec_mergeh(t0, t2); 736 kernel.packet[1] = vec_mergel(t0, t2); 737 kernel.packet[2] = vec_mergeh(t1, t3); 738 kernel.packet[3] = vec_mergel(t1, t3); 739} 740 741template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { 742 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; 743 Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE))); 744 return vec_sel(elsePacket, thenPacket, mask); 745} 746 747template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { 748 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; 749 Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE))); 750 return vec_sel(elsePacket, thenPacket, mask); 751} 752 753 754//---------- double ---------- 755#ifdef __VSX__ 756typedef __vector double Packet2d; 757typedef __vector unsigned long long Packet2ul; 758typedef __vector long long Packet2l; 759#if EIGEN_COMP_CLANG 760typedef Packet2ul Packet2bl; 761#else 762typedef __vector __bool long Packet2bl; 763#endif 764 765static Packet2l p2l_ONE = { 1, 1 }; 766static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO); 767static Packet2d p2d_ONE = { 1.0, 1.0 }; 768static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO); 769static Packet2d p2d_MZERO = { -0.0, -0.0 }; 770 771#ifdef _BIG_ENDIAN 772static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8)); 773#else 774static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8)); 775#endif 776 777template<int index> Packet2d vec_splat_dbl(Packet2d& a); 778 779template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) 780{ 781 return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI)); 782} 783 784template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) 785{ 786 return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO)); 787} 788 789template<> struct packet_traits<double> : default_packet_traits 790{ 791 typedef Packet2d type; 792 typedef Packet2d half; 793 enum { 794 Vectorizable = 1, 795 AlignedOnScalar = 1, 796 size=2, 797 HasHalfPacket = 1, 798 799 HasAdd = 1, 800 HasSub = 1, 801 HasMul = 1, 802 HasDiv = 1, 803 HasMin = 1, 804 HasMax = 1, 805 HasAbs = 1, 806 HasSin = 0, 807 HasCos = 0, 808 HasLog = 0, 809 HasExp = 1, 810 HasSqrt = 1, 811 HasRsqrt = 1, 812 HasRound = 1, 813 HasFloor = 1, 814 HasCeil = 1, 815 HasNegate = 1, 816 HasBlend = 1 817 }; 818}; 819 820template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; 821 822inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) 823{ 824 union { 825 Packet2l v; 826 int64_t n[2]; 827 } vt; 828 vt.v = v; 829 s << vt.n[0] << ", " << vt.n[1]; 830 return s; 831} 832 833inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) 834{ 835 union { 836 Packet2d v; 837 double n[2]; 838 } vt; 839 vt.v = v; 840 s << vt.n[0] << ", " << vt.n[1]; 841 return s; 842} 843 844// Need to define them first or we get specialization after instantiation errors 845template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) 846{ 847 EIGEN_DEBUG_ALIGNED_LOAD 848#ifdef __VSX__ 849 return vec_vsx_ld(0, from); 850#else 851 return vec_ld(0, from); 852#endif 853} 854 855template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) 856{ 857 EIGEN_DEBUG_ALIGNED_STORE 858#ifdef __VSX__ 859 vec_vsx_st(from, 0, to); 860#else 861 vec_st(from, 0, to); 862#endif 863} 864 865template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { 866 Packet2d v = {from, from}; 867 return v; 868} 869 870template<> EIGEN_STRONG_INLINE void 871pbroadcast4<Packet2d>(const double *a, 872 Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) 873{ 874 a1 = pload<Packet2d>(a); 875 a0 = vec_splat_dbl<0>(a1); 876 a1 = vec_splat_dbl<1>(a1); 877 a3 = pload<Packet2d>(a+2); 878 a2 = vec_splat_dbl<0>(a3); 879 a3 = vec_splat_dbl<1>(a3); 880} 881 882template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) 883{ 884 double EIGEN_ALIGN16 af[2]; 885 af[0] = from[0*stride]; 886 af[1] = from[1*stride]; 887 return pload<Packet2d>(af); 888} 889template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) 890{ 891 double EIGEN_ALIGN16 af[2]; 892 pstore<double>(af, from); 893 to[0*stride] = af[0]; 894 to[1*stride] = af[1]; 895} 896 897template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; } 898 899template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; } 900 901template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; } 902 903template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } 904 905template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } 906 907template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); } 908template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); } 909 910// for some weird raisons, it has to be overloaded for packet of integers 911template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } 912 913template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } 914 915template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } 916 917template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } 918 919template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } 920 921template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } 922 923template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } 924 925template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); } 926template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); } 927template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); } 928 929template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) 930{ 931 EIGEN_DEBUG_ALIGNED_LOAD 932 return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); 933} 934 935template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) 936{ 937 Packet2d p; 938 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from); 939 else p = ploadu<Packet2d>(from); 940 return vec_splat_dbl<0>(p); 941} 942 943template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) 944{ 945 EIGEN_DEBUG_ALIGNED_STORE 946 vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); 947} 948 949template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); } 950 951template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; } 952 953template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) 954{ 955 return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64)); 956} 957template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } 958 959template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) 960{ 961 Packet2d b, sum; 962 b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8)); 963 sum = a + b; 964 return pfirst<Packet2d>(sum); 965} 966 967template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) 968{ 969 Packet2d v[2], sum; 970 v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8)); 971 v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8)); 972 973#ifdef _BIG_ENDIAN 974 sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8)); 975#else 976 sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8)); 977#endif 978 979 return sum; 980} 981// Other reduction functions: 982// mul 983template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) 984{ 985 return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 986} 987 988// min 989template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) 990{ 991 return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 992} 993 994// max 995template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) 996{ 997 return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 998} 999 1000template<int Offset> 1001struct palign_impl<Offset,Packet2d> 1002{ 1003 static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) 1004 { 1005 if (Offset == 1) 1006#ifdef _BIG_ENDIAN 1007 first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8)); 1008#else 1009 first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8)); 1010#endif 1011 } 1012}; 1013 1014EIGEN_DEVICE_FUNC inline void 1015ptranspose(PacketBlock<Packet2d,2>& kernel) { 1016 Packet2d t0, t1; 1017 t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); 1018 t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); 1019 kernel.packet[0] = t0; 1020 kernel.packet[1] = t1; 1021} 1022 1023template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { 1024 Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; 1025 Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)); 1026 return vec_sel(elsePacket, thenPacket, mask); 1027} 1028#endif // __VSX__ 1029} // end namespace internal 1030 1031} // end namespace Eigen 1032 1033#endif // EIGEN_PACKET_MATH_ALTIVEC_H 1034