1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This file is part of Eigen, a lightweight C++ template library 2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for linear algebra. 3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr> 5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This Source Code Form is subject to the terms of the Mozilla 7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Public License v. 2.0. If a copy of the MPL was not distributed 8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_PACKET_MATH_ALTIVEC_H 11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_PACKET_MATH_ALTIVEC_H 12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace Eigen { 14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace internal { 16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 19c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 20c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 21c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_HAS_FUSE_CJMADD 22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_HAS_FUSE_CJMADD 1 23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 27c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector float Packet4f; 31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector int Packet4i; 32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector unsigned int Packet4ui; 33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector __bool int Packet4bi; 34c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector short int Packet8i; 35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef __vector unsigned char Packet16uc; 36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// We don't want to write the same code all the time, but we need to reuse the constants 38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// and it doesn't really work to declare them global, so we define macros instead 39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ 41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X) 42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ 44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i p4i_##NAME = vec_splat_s32(X) 45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ 47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f p4f_##NAME = pset1<Packet4f>(X) 48c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ 50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X)) 51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ 53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i p4i_##NAME = pset1<Packet4i>(X) 54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define DST_CHAN 1 56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) 57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Define global static constants: 59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 }; 60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 }; 61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; 62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7}; 64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); 66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); 67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); 68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); 69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); 70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); 71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); 72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct packet_traits<float> : default_packet_traits 74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef Packet4f type; 76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Vectorizable = 1, 78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath AlignedOnScalar = 1, 79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath size=4, 80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // FIXME check the Has* 82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasSin = 0, 83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasCos = 0, 84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasLog = 0, 85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasExp = 0, 86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasSqrt = 0 87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct packet_traits<int> : default_packet_traits 90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef Packet4i type; 92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // FIXME check the Has* 94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Vectorizable = 1, 95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath AlignedOnScalar = 1, 96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath size=4 97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; 101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; 102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* 103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathinline std::ostream & operator <<(std::ostream & s, const Packet4f & v) 104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath union { 106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f v; 107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float n[4]; 108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } vt; 109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vt.v = v; 110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return s; 112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathinline std::ostream & operator <<(std::ostream & s, const Packet4i & v) 115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath union { 117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i v; 118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int n[4]; 119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } vt; 120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vt.v = v; 121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return s; 123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathinline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) 126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath union { 128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4ui v; 129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath unsigned int n[4]; 130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } vt; 131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vt.v = v; 132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return s; 134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 136c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathinline std::ostream & operator <<(std::ostream & s, const Packetbi & v) 137c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 138c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath union { 139c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4bi v; 140c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath unsigned int n[4]; 141c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } vt; 142c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vt.v = v; 143c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 144c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return s; 145c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 146c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath*/ 147c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { 148c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 149c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float EIGEN_ALIGN16 af[4]; 150c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath af[0] = from; 151c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f vc = vec_ld(0, af); 152c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vc = vec_splat(vc, 0); 153c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vc; 154c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 155c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 156c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { 157c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int EIGEN_ALIGN16 ai[4]; 158c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ai[0] = from; 159c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i vc = vec_ld(0, ai); 160c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vc = vec_splat(vc, 0); 161c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vc; 162c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 163c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 164c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); } 165c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); } 166c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 167c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); } 168c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); } 169c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 170c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); } 171c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); } 172c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 173c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); } 174c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); } 175c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 176c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); } 177c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* Commented out: it's actually slower than processing it scalar 178c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * 179c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) 180c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 181c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec 182c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath //Set up constants, variables 183c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel; 184c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 185c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get the absolute values 186c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a1 = vec_abs(a); 187c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b1 = vec_abs(b); 188c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 189c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get the signs using xor 190c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO); 191c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 192c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do the multiplication for the asbolute values. 193c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 ); 194c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1); 195c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO); 196c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16); 197c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod = vec_add( low_prod, high_prod ); 198c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 199c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // NOR the product and select only the negative elements according to the sign mask 200c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod_ = vec_nor(prod, prod); 201c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod_ = vec_sel(p4i_ZERO, prod_, sgn); 202c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 203c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Add 1 to the result to get the negative numbers 204c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn); 205c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod_ = vec_add(prod_, v1sel); 206c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 207c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Merge the results back to the final vector. 208c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod = vec_sel(prod, prod_, sgn); 209c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 210c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return prod; 211c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 212c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath*/ 213c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) 214c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 215c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f t, y_0, y_1, res; 216c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 217c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Altivec does not offer a divide instruction, we have to do a reciprocal approximation 218c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y_0 = vec_re(b); 219c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 220c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do one Newton-Raphson iteration to get the needed accuracy 221c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath t = vec_nmsub(y_0, b, p4f_ONE); 222c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y_1 = vec_madd(y_0, t, y_0); 223c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 224c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res = vec_madd(a, y_1, p4f_ZERO); 225c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return res; 226c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 227c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 228c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) 229c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ eigen_assert(false && "packet integer division are not supported by AltiVec"); 230c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pset1<Packet4i>(0); 231c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 232c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 233c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for some weird raisons, it has to be overloaded for packet of integers 234c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } 235c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } 236c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 237c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } 238c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } 239c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 240c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } 241c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } 242c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 243c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics 244c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } 245c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } 246c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 247c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } 248c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } 249c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 250c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } 251c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } 252c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 253c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } 254c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } 255c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 256c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } 257c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } 258c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 259c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 260c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 261c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_DEBUG_ALIGNED_LOAD 262c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 263c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc MSQ, LSQ; 264c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc mask; 265c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 266c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 267c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath mask = vec_lvsl(0, from); // create the permute mask 268c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data 269c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 270c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 271c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 272c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 273c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_DEBUG_ALIGNED_LOAD 274c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 275c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc MSQ, LSQ; 276c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc mask; 277c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 278c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 279c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath mask = vec_lvsl(0, from); // create the permute mask 280c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data 281c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 282c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 283c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) 284c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 285c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f p; 286c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4f>(from); 287c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath else p = ploadu<Packet4f>(from); 288c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vec_perm(p, p, p16uc_DUPLICATE); 289c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 290c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) 291c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 292c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i p; 293c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4i>(from); 294c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath else p = ploadu<Packet4i>(from); 295c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vec_perm(p, p, p16uc_DUPLICATE); 296c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 297c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 298c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } 299c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } 300c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 301c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 302c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 303c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_DEBUG_UNALIGNED_STORE 304c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 305c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Warning: not thread safe! 306c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc MSQ, LSQ, edges; 307c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc edgeAlign, align; 308c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 309c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 310c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 311c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath edgeAlign = vec_lvsl(0, to); // permute map to extract edges 312c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges 313c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath align = vec_lvsr( 0, to ); // permute map to misalign data 314c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) 315c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) 316c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 317c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 318c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 319c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 320c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 321c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_DEBUG_UNALIGNED_STORE 322c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 323c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Warning: not thread safe! 324c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc MSQ, LSQ, edges; 325c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet16uc edgeAlign, align; 326c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 327c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 328c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 329c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath edgeAlign = vec_lvsl(0, to); // permute map to extract edges 330c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges 331c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath align = vec_lvsr( 0, to ); // permute map to misalign data 332c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) 333c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) 334c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 335c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 336c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 337c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 338c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } 339c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } 340c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 341c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } 342c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } 343c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 344c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); } 345c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); } 346c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 347c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } 348c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } 349c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 350c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) 351c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 352c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f b, sum; 353c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = (Packet4f) vec_sld(a, a, 8); 354c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vec_add(a, b); 355c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = (Packet4f) vec_sld(sum, sum, 4); 356c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vec_add(sum, b); 357c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(sum); 358c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 359c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 360c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) 361c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 362c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f v[4], sum[4]; 363c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 364c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // It's easier and faster to transpose then add as columns 365c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 366c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do the transpose, first set of moves 367c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[0] = vec_mergeh(vecs[0], vecs[2]); 368c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[1] = vec_mergel(vecs[0], vecs[2]); 369c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[2] = vec_mergeh(vecs[1], vecs[3]); 370c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[3] = vec_mergel(vecs[1], vecs[3]); 371c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get the resulting vectors 372c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_mergeh(v[0], v[2]); 373c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[1] = vec_mergel(v[0], v[2]); 374c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[2] = vec_mergeh(v[1], v[3]); 375c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[3] = vec_mergel(v[1], v[3]); 376c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 377c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Now do the summation: 378c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Lines 0+1 379c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_add(sum[0], sum[1]); 380c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Lines 2+3 381c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[1] = vec_add(sum[2], sum[3]); 382c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Add the results 383c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_add(sum[0], sum[1]); 384c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 385c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return sum[0]; 386c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 387c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 388c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) 389c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 390c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i sum; 391c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vec_sums(a, p4i_ZERO); 392c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vec_sld(sum, p4i_ZERO, 12); 393c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(sum); 394c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 395c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 396c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) 397c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 398c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i v[4], sum[4]; 399c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 400c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // It's easier and faster to transpose then add as columns 401c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 402c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do the transpose, first set of moves 403c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[0] = vec_mergeh(vecs[0], vecs[2]); 404c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[1] = vec_mergel(vecs[0], vecs[2]); 405c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[2] = vec_mergeh(vecs[1], vecs[3]); 406c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath v[3] = vec_mergel(vecs[1], vecs[3]); 407c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get the resulting vectors 408c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_mergeh(v[0], v[2]); 409c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[1] = vec_mergel(v[0], v[2]); 410c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[2] = vec_mergeh(v[1], v[3]); 411c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[3] = vec_mergel(v[1], v[3]); 412c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 413c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Now do the summation: 414c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Lines 0+1 415c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_add(sum[0], sum[1]); 416c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Lines 2+3 417c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[1] = vec_add(sum[2], sum[3]); 418c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Add the results 419c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum[0] = vec_add(sum[0], sum[1]); 420c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 421c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return sum[0]; 422c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 423c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 424c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Other reduction functions: 425c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// mul 426c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) 427c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 428c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f prod; 429c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod = pmul(a, (Packet4f)vec_sld(a, a, 8)); 430c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4))); 431c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 432c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 433c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) 434c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 435c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_ALIGN16 int aux[4]; 436c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pstore(aux, a); 437c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return aux[0] * aux[1] * aux[2] * aux[3]; 438c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 439c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 440c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// min 441c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) 442c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 443c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f b, res; 444c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = vec_min(a, vec_sld(a, a, 8)); 445c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res = vec_min(b, vec_sld(b, b, 4)); 446c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(res); 447c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 448c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 449c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) 450c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 451c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i b, res; 452c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = vec_min(a, vec_sld(a, a, 8)); 453c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res = vec_min(b, vec_sld(b, b, 4)); 454c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(res); 455c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 456c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 457c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// max 458c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) 459c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 460c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f b, res; 461c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = vec_max(a, vec_sld(a, a, 8)); 462c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res = vec_max(b, vec_sld(b, b, 4)); 463c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(res); 464c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 465c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 466c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) 467c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 468c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i b, res; 469c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b = vec_max(a, vec_sld(a, a, 8)); 470c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res = vec_max(b, vec_sld(b, b, 4)); 471c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pfirst(res); 472c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 473c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 474c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<int Offset> 475c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct palign_impl<Offset,Packet4f> 476c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 477c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) 478c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 479c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (Offset!=0) 480c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath first = vec_sld(first, second, Offset*4); 481c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 482c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 483c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 484c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<int Offset> 485c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct palign_impl<Offset,Packet4i> 486c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 487c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) 488c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 489c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (Offset!=0) 490c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath first = vec_sld(first, second, Offset*4); 491c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 492c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 493c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 494c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace internal 495c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 496c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace Eigen 497c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 498c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif // EIGEN_PACKET_MATH_ALTIVEC_H 499