1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This file is part of Eigen, a lightweight C++ template library 2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for linear algebra. 3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> 5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr> 6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Heavily based on Gael's SSE version. 7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This Source Code Form is subject to the terms of the Mozilla 9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Public License v. 2.0. If a copy of the MPL was not distributed 10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
#ifndef EIGEN_PACKET_MATH_NEON_H
#define EIGEN_PACKET_MATH_NEON_H

namespace Eigen {

namespace internal {

// Default value of 8 for this tunable; a definition made before this header
// is included takes precedence.
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// FIXME NEON has 16 quad registers, but since the current register allocator
// is so bad, it is much better to reduce it to 8
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
#endif

// Packet aliases for the 128-bit NEON vector types used throughout this file.
typedef float32x4_t Packet4f;
typedef int32x4_t   Packet4i;
typedef uint32x4_t  Packet4ui;

// Declares a constant Packet4f named p4f_<NAME> with X broadcast through pset1.
// The expansion has no trailing semicolon; the use site must supply it.
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ 37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X)) 38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ 40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Packet4i p4i_##NAME = pset1<Packet4i>(X) 41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#if defined(__llvm__) && !defined(__clang__) 43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath //Special treatment for Apple's llvm-gcc, its NEON packet types are unions 44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath #define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}} 45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}} 46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#else 47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath //Default initializer for packets 48c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath #define EIGEN_INIT_NEON_PACKET2(X, Y) {X, Y} 49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W} 50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef __pld 53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" ); 54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct packet_traits<float> : default_packet_traits 
57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef Packet4f type; 59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Vectorizable = 1, 61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath AlignedOnScalar = 1, 62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath size = 4, 63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasDiv = 1, 65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // FIXME check the Has* 66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasSin = 0, 67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasCos = 0, 68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasLog = 0, 69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasExp = 0, 70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath HasSqrt = 0 71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct packet_traits<int> : default_packet_traits 74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef Packet4i type; 76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Vectorizable = 1, 78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath AlignedOnScalar = 1, 79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath size=4 80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // FIXME check the Has* 81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__) 
85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// workaround gcc 4.2, 4.3 and 4.4 compilatin issue 86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } 87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } 88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } 89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } 90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; 93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; 94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } 96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); } 97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) 99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3); 101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vaddq_f32(pset1<Packet4f>(a), countdown); 
102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) 104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3); 106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vaddq_s32(pset1<Packet4i>(a), countdown); 107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } 110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } 111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } 113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); } 114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } 116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); } 117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 1187faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeztemplate<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } 1197faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeztemplate<> EIGEN_STRONG_INLINE Packet4i 
pconj(const Packet4i& a) { return a; } 1207faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); } 122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); } 123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) 125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f inv, restep, div; 127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // NEON does not offer a divide instruction, we have to do a reciprocal approximation 129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers 130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // a reciprocal estimate AND a reciprocal step -which saves a few instructions 131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with 132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Newton-Raphson and vrecpsq_f32() 133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inv = vrecpeq_f32(b); 134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // This returns a differential, by which we will have to multiply inv to get a better 136c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // approximation of 1/b. 
137c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath restep = vrecpsq_f32(b, inv); 138c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inv = vmulq_f32(restep, inv); 139c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 140c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Finally, multiply a by 1/b and get the wanted result of the division. 141c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath div = vmulq_f32(a, inv); 142c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 143c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return div; 144c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 145c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) 146c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ eigen_assert(false && "packet integer division are not supported by NEON"); 147c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return pset1<Packet4i>(0); 148c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 149c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 150c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for some weird raisons, it has to be overloaded for packet of integers 151c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } 152c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } 153c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 154c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } 155c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const 
Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } 156c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 157c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } 158c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } 159c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 160c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics 161c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) 162c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 163c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); 164c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 165c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } 166c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 167c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) 168c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 169c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); 170c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 171c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } 172c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 
173c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) 174c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 175c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); 176c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 177c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } 178c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 179c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) 180c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 181c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); 182c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 183c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } 184c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 185c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } 186c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } 187c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 188c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } 189c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> 
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } 190c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 191c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) 192c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 193c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x2_t lo, hi; 1947faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez lo = vld1_dup_f32(from); 1957faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez hi = vld1_dup_f32(from+1); 196c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vcombine_f32(lo, hi); 197c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 198c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) 199c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 200c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x2_t lo, hi; 2017faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez lo = vld1_dup_s32(from); 2027faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez hi = vld1_dup_s32(from+1); 203c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vcombine_s32(lo, hi); 204c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 205c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 206c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } 207c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } 208c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 209c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { 
EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } 210c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } 211c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 212c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { __pld(addr); } 213c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { __pld(addr); } 214c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 215c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// FIXME only store the 2 first elements ? 216c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } 217c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } 218c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 219c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { 220c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x2_t a_lo, a_hi; 221c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f a_r64; 222c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 223c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_r64 = vrev64q_f32(a); 224c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_f32(a_r64); 225c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_f32(a_r64); 226c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vcombine_f32(a_hi, a_lo); 227c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 228c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> 
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { 229c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x2_t a_lo, a_hi; 230c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i a_r64; 231c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 232c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_r64 = vrev64q_s32(a); 233c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_s32(a_r64); 234c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_s32(a_r64); 235c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return vcombine_s32(a_hi, a_lo); 236c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 237c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } 238c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } 239c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 240c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) 241c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 242c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x2_t a_lo, a_hi, sum; 243c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 244c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_f32(a); 245c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_f32(a); 246c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vpadd_f32(a_lo, a_hi); 247c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vpadd_f32(sum, sum); 2487faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez return vget_lane_f32(sum, 0); 249c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 250c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 251c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE 
Packet4f preduxp<Packet4f>(const Packet4f* vecs) 252c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 253c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x4x2_t vtrn1, vtrn2, res1, res2; 254c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f sum1, sum2, sum; 255c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 256c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // NEON zip performs interleaving of the supplied vectors. 257c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // We perform two interleaves in a row to acquire the transposed vector 258c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vtrn1 = vzipq_f32(vecs[0], vecs[2]); 259c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vtrn2 = vzipq_f32(vecs[1], vecs[3]); 260c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); 261c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); 262c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 263c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do the addition of the resulting vectors 264c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum1 = vaddq_f32(res1.val[0], res1.val[1]); 265c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum2 = vaddq_f32(res2.val[0], res2.val[1]); 266c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vaddq_f32(sum1, sum2); 267c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 268c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return sum; 269c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 270c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 271c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) 272c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 273c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x2_t a_lo, a_hi, sum; 
274c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 275c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_s32(a); 276c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_s32(a); 277c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vpadd_s32(a_lo, a_hi); 278c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vpadd_s32(sum, sum); 2797faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez return vget_lane_s32(sum, 0); 280c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 281c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 282c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) 283c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 284c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x4x2_t vtrn1, vtrn2, res1, res2; 285c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i sum1, sum2, sum; 286c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 287c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // NEON zip performs interleaving of the supplied vectors. 
288c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // We perform two interleaves in a row to acquire the transposed vector 289c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vtrn1 = vzipq_s32(vecs[0], vecs[2]); 290c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vtrn2 = vzipq_s32(vecs[1], vecs[3]); 291c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); 292c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); 293c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 294c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Do the addition of the resulting vectors 295c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum1 = vaddq_s32(res1.val[0], res1.val[1]); 296c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum2 = vaddq_s32(res2.val[0], res2.val[1]); 297c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sum = vaddq_s32(sum1, sum2); 298c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 299c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return sum; 300c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 301c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 302c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Other reduction functions: 303c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// mul 304c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) 305c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 306c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x2_t a_lo, a_hi, prod; 307c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 308c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get a_lo = |a1|a2| and a_hi = |a3|a4| 309c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_f32(a); 310c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_f32(a); 
311c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| 312c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod = vmul_f32(a_lo, a_hi); 313c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Multiply prod with its swapped value |a2*a4|a1*a3| 314c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod = vmul_f32(prod, vrev64_f32(prod)); 315c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 3167faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez return vget_lane_f32(prod, 0); 317c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 318c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) 319c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 320c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x2_t a_lo, a_hi, prod; 321c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 322c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get a_lo = |a1|a2| and a_hi = |a3|a4| 323c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_s32(a); 324c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_s32(a); 325c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| 326c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod = vmul_s32(a_lo, a_hi); 327c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Multiply prod with its swapped value |a2*a4|a1*a3| 328c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath prod = vmul_s32(prod, vrev64_s32(prod)); 329c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 3307faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez return vget_lane_s32(prod, 0); 331c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 332c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 333c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// min 
334c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) 335c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 336c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x2_t a_lo, a_hi, min; 337c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 338c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_f32(a); 339c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_f32(a); 340c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath min = vpmin_f32(a_lo, a_hi); 341c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath min = vpmin_f32(min, min); 342c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 3437faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez return vget_lane_f32(min, 0); 344c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 3457faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 346c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) 347c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 348c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x2_t a_lo, a_hi, min; 349c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 350c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_s32(a); 351c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_s32(a); 352c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath min = vpmin_s32(a_lo, a_hi); 353c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath min = vpmin_s32(min, min); 3547faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 3557faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez return vget_lane_s32(min, 0); 356c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 357c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 358c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// max 359c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan 
Kamathtemplate<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) 360c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 361c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath float32x2_t a_lo, a_hi, max; 362c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 363c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_f32(a); 364c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_f32(a); 365c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath max = vpmax_f32(a_lo, a_hi); 366c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath max = vpmax_f32(max, max); 367c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 3687faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez return vget_lane_f32(max, 0); 369c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 3707faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 371c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) 372c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 373c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int32x2_t a_lo, a_hi, max; 374c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 375c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_lo = vget_low_s32(a); 376c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a_hi = vget_high_s32(a); 377c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath max = vpmax_s32(a_lo, a_hi); 378c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 3797faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez return vget_lane_s32(max, 0); 380c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 381c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 382c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, 383c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// see bug 347 and this LLVM bug: 
http://llvm.org/bugs/show_bug.cgi?id=11074 384c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define PALIGN_NEON(Offset,Type,Command) \ 385c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<>\ 386c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct palign_impl<Offset,Type>\ 387c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{\ 388c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ 389c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath {\ 390c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (Offset!=0)\ 391c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath first = Command(first, second, Offset);\ 392c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }\ 393c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath};\ 394c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 395c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(0,Packet4f,vextq_f32) 396c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(1,Packet4f,vextq_f32) 397c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(2,Packet4f,vextq_f32) 398c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(3,Packet4f,vextq_f32) 399c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(0,Packet4i,vextq_s32) 400c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(1,Packet4i,vextq_s32) 401c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(2,Packet4i,vextq_s32) 402c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(3,Packet4i,vextq_s32) 403c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 404c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#undef PALIGN_NEON 405c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 406c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace internal 407c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 
408c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace Eigen 409c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 410c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif // EIGEN_PACKET_MATH_NEON_H 411