// PacketMath.h revision c981c48f5bc9aefeffc0bcb0cc3934c2fae179dd
//
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
// Heavily based on Gael's SSE version.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_PACKET_MATH_NEON_H 13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_PACKET_MATH_NEON_H 14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace Eigen { 16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace internal { 18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 19c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 20c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 21c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// FIXME NEON has 16 quad registers, but since the current register allocator 24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// is so bad, it is much better to reduce it to 8 25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8 27c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef float32x4_t Packet4f; 30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef int32x4_t Packet4i; 31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef uint32x4_t Packet4ui; 32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ 34c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Packet4f p4f_##NAME = pset1<Packet4f>(X) 
35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ 37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X)) 38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ 40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Packet4i p4i_##NAME = pset1<Packet4i>(X) 41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#if defined(__llvm__) && !defined(__clang__) 43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath //Special treatment for Apple's llvm-gcc, its NEON packet types are unions 44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath #define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}} 45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}} 46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#else 47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath //Default initializer for packets 48c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath #define EIGEN_INIT_NEON_PACKET2(X, Y) {X, Y} 49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W} 50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef __pld 53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" ); 54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct packet_traits<float> : default_packet_traits 
57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef Packet4f type; 59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Vectorizable = 1, 61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath AlignedOnScalar = 1, 62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath size = 4, 63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasDiv = 1, 65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // FIXME check the Has* 66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasSin = 0, 67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasCos = 0, 68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasLog = 0, 69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasExp = 0, 70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasSqrt = 0 71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct packet_traits<int> : default_packet_traits 74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef Packet4i type; 76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Vectorizable = 1, 78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath AlignedOnScalar = 1, 79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath size=4 80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // FIXME check the Has* 81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__) 
85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// workaround gcc 4.2, 4.3 and 4.4 compilatin issue 86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } 87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } 88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } 89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } 90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; 93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; 94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } 96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); } 97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) 99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3); 101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vaddq_f32(pset1<Packet4f>(a), countdown); 
102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) 104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3); 106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vaddq_s32(pset1<Packet4i>(a), countdown); 107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } 110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } 111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } 113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); } 114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } 116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); } 117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); } 119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan 
Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); } 120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) 122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f inv, restep, div; 124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // NEON does not offer a divide instruction, we have to do a reciprocal approximation 126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers 127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // a reciprocal estimate AND a reciprocal step -which saves a few instructions 128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with 129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Newton-Raphson and vrecpsq_f32() 130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inv = vrecpeq_f32(b); 131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // This returns a differential, by which we will have to multiply inv to get a better 133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // approximation of 1/b. 134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath restep = vrecpsq_f32(b, inv); 135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inv = vmulq_f32(restep, inv); 136c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 137c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Finally, multiply a by 1/b and get the wanted result of the division. 
138c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath div = vmulq_f32(a, inv); 139c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 140c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return div; 141c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 142c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) 143c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ eigen_assert(false && "packet integer division are not supported by NEON"); 144c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pset1<Packet4i>(0); 145c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 146c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 147c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for some weird raisons, it has to be overloaded for packet of integers 148c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } 149c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } 150c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 151c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } 152c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } 153c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 154c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } 155c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> 
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } 156c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 157c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics 158c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) 159c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 160c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); 161c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 162c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } 163c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 164c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) 165c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 166c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); 167c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 168c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } 169c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 170c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) 171c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 172c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); 
173c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 174c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } 175c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 176c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) 177c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 178c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); 179c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 180c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } 181c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 182c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } 183c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } 184c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 185c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } 186c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } 187c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 188c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) 189c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan 
Kamath{ 190c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x2_t lo, hi; 191c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath lo = vdup_n_f32(*from); 192c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath hi = vdup_n_f32(*(from+1)); 193c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vcombine_f32(lo, hi); 194c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 195c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) 196c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 197c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x2_t lo, hi; 198c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath lo = vdup_n_s32(*from); 199c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath hi = vdup_n_s32(*(from+1)); 200c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vcombine_s32(lo, hi); 201c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 202c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 203c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } 204c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } 205c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 206c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } 207c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } 208c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 209c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> 
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { __pld(addr); } 210c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { __pld(addr); } 211c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 212c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// FIXME only store the 2 first elements ? 213c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } 214c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } 215c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 216c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { 217c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x2_t a_lo, a_hi; 218c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f a_r64; 219c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 220c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_r64 = vrev64q_f32(a); 221c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_f32(a_r64); 222c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_f32(a_r64); 223c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vcombine_f32(a_hi, a_lo); 224c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 225c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { 226c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x2_t a_lo, a_hi; 227c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i a_r64; 228c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 229c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_r64 = vrev64q_s32(a); 
230c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_s32(a_r64); 231c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_s32(a_r64); 232c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vcombine_s32(a_hi, a_lo); 233c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 234c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } 235c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } 236c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 237c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) 238c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 239c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x2_t a_lo, a_hi, sum; 240c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float s[2]; 241c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 242c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_f32(a); 243c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_f32(a); 244c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vpadd_f32(a_lo, a_hi); 245c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vpadd_f32(sum, sum); 246c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vst1_f32(s, sum); 247c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 248c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return s[0]; 249c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 250c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 251c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) 252c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 253c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 
float32x4x2_t vtrn1, vtrn2, res1, res2; 254c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f sum1, sum2, sum; 255c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 256c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // NEON zip performs interleaving of the supplied vectors. 257c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // We perform two interleaves in a row to acquire the transposed vector 258c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vtrn1 = vzipq_f32(vecs[0], vecs[2]); 259c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vtrn2 = vzipq_f32(vecs[1], vecs[3]); 260c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); 261c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); 262c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 263c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do the addition of the resulting vectors 264c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum1 = vaddq_f32(res1.val[0], res1.val[1]); 265c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum2 = vaddq_f32(res2.val[0], res2.val[1]); 266c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vaddq_f32(sum1, sum2); 267c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 268c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return sum; 269c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 270c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 271c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) 272c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 273c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x2_t a_lo, a_hi, sum; 274c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32_t s[2]; 275c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 276c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = 
vget_low_s32(a); 277c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_s32(a); 278c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vpadd_s32(a_lo, a_hi); 279c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vpadd_s32(sum, sum); 280c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vst1_s32(s, sum); 281c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 282c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return s[0]; 283c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 284c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 285c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) 286c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 287c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x4x2_t vtrn1, vtrn2, res1, res2; 288c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i sum1, sum2, sum; 289c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 290c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // NEON zip performs interleaving of the supplied vectors. 
291c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // We perform two interleaves in a row to acquire the transposed vector 292c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vtrn1 = vzipq_s32(vecs[0], vecs[2]); 293c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vtrn2 = vzipq_s32(vecs[1], vecs[3]); 294c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); 295c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); 296c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 297c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do the addition of the resulting vectors 298c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum1 = vaddq_s32(res1.val[0], res1.val[1]); 299c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum2 = vaddq_s32(res2.val[0], res2.val[1]); 300c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vaddq_s32(sum1, sum2); 301c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 302c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return sum; 303c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 304c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 305c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Other reduction functions: 306c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// mul 307c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) 308c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 309c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x2_t a_lo, a_hi, prod; 310c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float s[2]; 311c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 312c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get a_lo = |a1|a2| and a_hi = |a3|a4| 313c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_f32(a); 
a_hi = vget_high_f32(a);
  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
  prod = vmul_f32(a_lo, a_hi);
  // Multiply prod with its swapped value |a2*a4|a1*a3|
  prod = vmul_f32(prod, vrev64_f32(prod));
  vst1_f32(s, prod);

  return s[0];
}

// Horizontal product of the four int lanes of a, i.e. a1*a2*a3*a4.
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
  // Split the packet into its halves: lo = |a1|a2| and hi = |a3|a4|
  const int32x2_t lo = vget_low_s32(a);
  const int32x2_t hi = vget_high_s32(a);
  // Lane-wise product of the halves -> |a1*a3|a2*a4|
  const int32x2_t pairs = vmul_s32(lo, hi);
  // Multiply by the lane-swapped copy |a2*a4|a1*a3|, so that both lanes
  // end up holding the full product.
  const int32x2_t prod = vmul_s32(pairs, vrev64_s32(pairs));

  // Read lane 0 directly rather than storing the vector to a scratch array.
  return vget_lane_s32(prod, 0);
}

// min
// Smallest of the four float lanes of a.
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  // Pairwise minimum of the two halves -> |min(a1,a2)|min(a3,a4)|
  const float32x2_t halves = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
  // A second pairwise minimum folds the two candidates into every lane.
  const float32x2_t lowest = vpmin_f32(halves, halves);

  // Read lane 0 directly rather than storing the vector to a scratch array.
  return vget_lane_f32(lowest, 0);
}

// Smallest of the four int lanes of a.
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
  // Pairwise minimum of the two halves -> |min(a1,a2)|min(a3,a4)|
  const int32x2_t halves = vpmin_s32(vget_low_s32(a), vget_high_s32(a));
  // A second pairwise minimum folds the two candidates into every lane.
  const int32x2_t lowest = vpmin_s32(halves, halves);

  // Read lane 0 directly rather than storing the vector to a scratch array.
  return vget_lane_s32(lowest, 0);
}

// max
// Largest of the four float lanes of a.
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  // Pairwise maximum of the two halves -> |max(a1,a2)|max(a3,a4)|
  const float32x2_t halves = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
  // A second pairwise maximum folds the two candidates into every lane.
  const float32x2_t highest = vpmax_f32(halves, halves);

  // Read lane 0 directly rather than storing the vector to a scratch array.
  return vget_lane_f32(highest, 0);
}

// Largest of the four int lanes of a.
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
  // Pairwise maximum of the two halves -> |max(a1,a2)|max(a3,a4)|
  const int32x2_t halves = vpmax_s32(vget_low_s32(a), vget_high_s32(a));
  // A second pairwise maximum folds the two candidates into every lane.
  const int32x2_t highest = vpmax_s32(halves, halves);

  // Read lane 0 directly rather than storing the vector to a scratch array.
  return vget_lane_s32(highest, 0);
}

// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
//
// Each use of the macro defines the palign_impl specialization for one
// (Offset, Type) pair. run() overwrites `first` with
// Command(first, second, Offset) when Offset is non-zero and leaves `first`
// untouched when Offset is 0. With the vextq_* intrinsics used below, the
// result is the lanes of `first` from index Offset onwards followed by the
// leading Offset lanes of `second`.
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
    {\
        if (Offset!=0)\
            first = Command(first, second, Offset);\
    }\
};

PALIGN_NEON(0,Packet4f,vextq_f32)
PALIGN_NEON(1,Packet4f,vextq_f32)
PALIGN_NEON(2,Packet4f,vextq_f32)
PALIGN_NEON(3,Packet4f,vextq_f32)
PALIGN_NEON(0,Packet4i,vextq_s32)
PALIGN_NEON(1,Packet4i,vextq_s32)
PALIGN_NEON(2,Packet4i,vextq_s32)
PALIGN_NEON(3,Packet4i,vextq_s32)

#undef PALIGN_NEON

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PACKET_MATH_NEON_H