1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This file is part of Eigen, a lightweight C++ template library 2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for linear algebra. 3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr> 5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This Source Code Form is subject to the terms of the Mozilla 7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Public License v. 2.0. If a copy of the MPL was not distributed 8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_PACKET_MATH_ALTIVEC_H 11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_PACKET_MATH_ALTIVEC_H 12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace Eigen { 14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace internal { 16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 19c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 20c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 21c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_HAS_FUSE_CJMADD 22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_HAS_FUSE_CJMADD 1 23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 27c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector float Packet4f; 31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector int Packet4i; 32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector unsigned int Packet4ui; 33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector __bool int Packet4bi; 34c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector short int Packet8i; 35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector unsigned char Packet16uc; 36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// We don't want to write the same code all the time, but we need to reuse the constants 38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// and it doesn't really work to declare them global, so we define macros instead 39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ 41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X) 42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ 44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i p4i_##NAME = vec_splat_s32(X) 45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ 47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f p4f_##NAME = pset1<Packet4f>(X) 48c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ 50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X)) 51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ 53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i p4i_##NAME = pset1<Packet4i>(X) 54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define DST_CHAN 1 56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) 57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Define global static constants: 59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 }; 60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 }; 61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; 62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7}; 64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); 66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); 67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); 68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); 69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); 70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); 71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); 72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct packet_traits<float> : default_packet_traits 74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef Packet4f type; 76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Vectorizable = 1, 78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath AlignedOnScalar = 1, 79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath size=4, 80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // FIXME check the Has* 82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasSin = 0, 83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasCos = 0, 84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasLog = 0, 85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasExp = 0, 86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasSqrt = 0 87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct packet_traits<int> : default_packet_traits 90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef Packet4i type; 92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // FIXME check the Has* 94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Vectorizable = 1, 95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath AlignedOnScalar = 1, 96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath size=4 97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; 101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; 102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* 103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathinline std::ostream & operator <<(std::ostream & s, const Packet4f & v) 104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath union { 106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f v; 107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float n[4]; 108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } vt; 109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vt.v = v; 110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return s; 112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathinline std::ostream & operator <<(std::ostream & s, const Packet4i & v) 115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath union { 117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i v; 118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int n[4]; 119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } vt; 120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vt.v = v; 121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return s; 123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathinline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) 126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath union { 128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4ui v; 129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath unsigned int n[4]; 130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } vt; 131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vt.v = v; 132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return s; 134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 136c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathinline std::ostream & operator <<(std::ostream & s, const Packetbi & v) 137c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 138c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath union { 139c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4bi v; 140c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath unsigned int n[4]; 141c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } vt; 142c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vt.v = v; 143c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 144c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return s; 145c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 146c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath*/ 147c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { 148c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 149c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float EIGEN_ALIGN16 af[4]; 150c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath af[0] = from; 151c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f vc = vec_ld(0, af); 152c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vc = vec_splat(vc, 0); 153c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vc; 154c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 155c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 156c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { 157c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int EIGEN_ALIGN16 ai[4]; 158c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ai[0] = from; 159c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i vc = vec_ld(0, ai); 160c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vc = vec_splat(vc, 0); 161c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vc; 162c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 163c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 164c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); } 165c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); } 166c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 167c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); } 168c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); } 169c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 170c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); } 171c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); } 172c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 173c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); } 174c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); } 175c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 1767faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeztemplate<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } 1777faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeztemplate<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } 1787faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 179c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); } 180c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* Commented out: it's actually slower than processing it scalar 181c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * 182c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) 183c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 184c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec 185c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath //Set up constants, variables 186c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel; 187c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 188c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get the absolute values 189c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a1 = vec_abs(a); 190c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b1 = vec_abs(b); 191c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 192c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get the signs using xor 193c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO); 194c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 195c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do the multiplication for the asbolute values. 196c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 ); 197c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1); 198c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO); 199c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16); 200c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod = vec_add( low_prod, high_prod ); 201c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 202c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // NOR the product and select only the negative elements according to the sign mask 203c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod_ = vec_nor(prod, prod); 204c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod_ = vec_sel(p4i_ZERO, prod_, sgn); 205c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 206c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Add 1 to the result to get the negative numbers 207c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn); 208c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod_ = vec_add(prod_, v1sel); 209c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 210c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Merge the results back to the final vector. 211c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod = vec_sel(prod, prod_, sgn); 212c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 213c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return prod; 214c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 215c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath*/ 216c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) 217c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 218c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f t, y_0, y_1, res; 219c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 220c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Altivec does not offer a divide instruction, we have to do a reciprocal approximation 221c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y_0 = vec_re(b); 222c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 223c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do one Newton-Raphson iteration to get the needed accuracy 224c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath t = vec_nmsub(y_0, b, p4f_ONE); 225c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y_1 = vec_madd(y_0, t, y_0); 226c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 227c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res = vec_madd(a, y_1, p4f_ZERO); 228c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return res; 229c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 230c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 231c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) 232c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ eigen_assert(false && "packet integer division are not supported by AltiVec"); 233c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pset1<Packet4i>(0); 234c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 235c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 236c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for some weird raisons, it has to be overloaded for packet of integers 237c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } 238c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } 239c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 240c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } 241c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } 242c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 243c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } 244c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } 245c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 246c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics 247c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } 248c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } 249c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 250c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } 251c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } 252c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 253c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } 254c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } 255c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 256c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } 257c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } 258c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 259c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } 260c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } 261c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 262c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 263c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 264c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_DEBUG_ALIGNED_LOAD 265c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 266c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc MSQ, LSQ; 267c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc mask; 268c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 269c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 270c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath mask = vec_lvsl(0, from); // create the permute mask 271c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data 272c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 273c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 274c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 275c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 276c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_DEBUG_ALIGNED_LOAD 277c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 278c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc MSQ, LSQ; 279c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc mask; 280c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 281c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 282c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath mask = vec_lvsl(0, from); // create the permute mask 283c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data 284c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 285c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 286c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) 287c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 288c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f p; 289c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4f>(from); 290c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath else p = ploadu<Packet4f>(from); 291c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vec_perm(p, p, p16uc_DUPLICATE); 292c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 293c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) 294c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 295c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i p; 296c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4i>(from); 297c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath else p = ploadu<Packet4i>(from); 298c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vec_perm(p, p, p16uc_DUPLICATE); 299c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 300c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 301c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } 302c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } 303c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 304c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 305c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 306c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_DEBUG_UNALIGNED_STORE 307c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 308c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Warning: not thread safe! 309c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc MSQ, LSQ, edges; 310c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc edgeAlign, align; 311c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 312c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 313c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 314c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath edgeAlign = vec_lvsl(0, to); // permute map to extract edges 315c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges 316c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath align = vec_lvsr( 0, to ); // permute map to misalign data 317c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) 318c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) 319c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 320c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 321c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 322c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 323c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 324c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_DEBUG_UNALIGNED_STORE 325c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 326c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Warning: not thread safe! 327c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc MSQ, LSQ, edges; 328c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc edgeAlign, align; 329c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 330c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 331c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 332c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath edgeAlign = vec_lvsl(0, to); // permute map to extract edges 333c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges 334c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath align = vec_lvsr( 0, to ); // permute map to misalign data 335c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) 336c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) 337c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 338c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 339c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 340c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 341c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } 342c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } 343c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 344c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } 345c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } 346c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 347c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); } 348c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); } 349c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 350c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } 351c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } 352c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 353c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) 354c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 355c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f b, sum; 356c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = (Packet4f) vec_sld(a, a, 8); 357c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vec_add(a, b); 358c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = (Packet4f) vec_sld(sum, sum, 4); 359c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vec_add(sum, b); 360c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(sum); 361c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 362c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 363c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) 364c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 365c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f v[4], sum[4]; 366c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 367c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // It's easier and faster to transpose then add as columns 368c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 369c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do the transpose, first set of moves 370c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[0] = vec_mergeh(vecs[0], vecs[2]); 371c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[1] = vec_mergel(vecs[0], vecs[2]); 372c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[2] = vec_mergeh(vecs[1], vecs[3]); 373c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[3] = vec_mergel(vecs[1], vecs[3]); 374c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get the resulting vectors 375c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_mergeh(v[0], v[2]); 376c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[1] = vec_mergel(v[0], v[2]); 377c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[2] = vec_mergeh(v[1], v[3]); 378c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[3] = vec_mergel(v[1], v[3]); 379c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 380c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Now do the summation: 381c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Lines 0+1 382c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_add(sum[0], sum[1]); 383c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Lines 2+3 384c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[1] = vec_add(sum[2], sum[3]); 385c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Add the results 386c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_add(sum[0], sum[1]); 387c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 388c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return sum[0]; 389c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 390c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 391c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) 392c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 393c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i sum; 394c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vec_sums(a, p4i_ZERO); 395c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vec_sld(sum, p4i_ZERO, 12); 396c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(sum); 397c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 398c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 399c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) 400c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 401c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i v[4], sum[4]; 402c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 403c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // It's easier and faster to transpose then add as columns 404c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 405c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do the transpose, first set of moves 406c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[0] = vec_mergeh(vecs[0], vecs[2]); 407c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[1] = vec_mergel(vecs[0], vecs[2]); 408c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[2] = vec_mergeh(vecs[1], vecs[3]); 409c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[3] = vec_mergel(vecs[1], vecs[3]); 410c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get the resulting vectors 411c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_mergeh(v[0], v[2]); 412c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[1] = vec_mergel(v[0], v[2]); 413c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[2] = vec_mergeh(v[1], v[3]); 414c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[3] = vec_mergel(v[1], v[3]); 415c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 416c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Now do the summation: 417c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Lines 0+1 418c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_add(sum[0], sum[1]); 419c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Lines 2+3 420c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[1] = vec_add(sum[2], sum[3]); 421c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Add the results 422c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_add(sum[0], sum[1]); 423c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 424c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return sum[0]; 425c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 426c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 427c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Other reduction functions: 428c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// mul 429c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) 430c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 431c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f prod; 432c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod = pmul(a, (Packet4f)vec_sld(a, a, 8)); 433c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4))); 434c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 435c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 436c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) 437c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 438c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_ALIGN16 int aux[4]; 439c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pstore(aux, a); 440c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return aux[0] * aux[1] * aux[2] * aux[3]; 441c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 442c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 443c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// min 444c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) 445c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 446c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f b, res; 447c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = vec_min(a, vec_sld(a, a, 8)); 448c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res = vec_min(b, vec_sld(b, b, 4)); 449c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(res); 450c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 451c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 452c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) 453c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 454c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i b, res; 455c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = vec_min(a, vec_sld(a, a, 8)); 456c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res = vec_min(b, vec_sld(b, b, 4)); 457c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(res); 458c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 459c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 460c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// max 461c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) 462c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 463c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f b, res; 464c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = vec_max(a, vec_sld(a, a, 8)); 465c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res = vec_max(b, vec_sld(b, b, 4)); 466c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(res); 467c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 468c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 469c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) 470c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 471c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i b, res; 472c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = vec_max(a, vec_sld(a, a, 8)); 473c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res = vec_max(b, vec_sld(b, b, 4)); 474c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(res); 475c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 476c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 477c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<int Offset> 478c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct palign_impl<Offset,Packet4f> 479c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 480c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) 481c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 482c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (Offset!=0) 483c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath first = vec_sld(first, second, Offset*4); 484c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 485c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 486c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 487c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<int Offset> 488c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct palign_impl<Offset,Packet4i> 489c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 490c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) 491c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 492c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (Offset!=0) 493c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath first = vec_sld(first, second, Offset*4); 494c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 495c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 496c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 497c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace internal 498c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 499c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace Eigen 500c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 501c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif // EIGEN_PACKET_MATH_ALTIVEC_H 502