PacketMath.h revision c981c48f5bc9aefeffc0bcb0cc3934c2fae179dd
1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This file is part of Eigen, a lightweight C++ template library
2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for linear algebra.
3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//
4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Heavily based on Gael's SSE version.
7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//
8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This Source Code Form is subject to the terms of the Mozilla
9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Public License v. 2.0. If a copy of the MPL was not distributed
10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_PACKET_MATH_NEON_H
13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_PACKET_MATH_NEON_H
14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace Eigen {
16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace internal {
18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
// Fallback definition: EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD is set to 8 here
// unless it has already been defined before this header is processed.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
// FIXME NEON has 16 quad registers, but since the current register allocator
// is so bad, it is much better to reduce it to 8.
// The value is only a default: a definition made before this header wins.
#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
#endif
28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef float32x4_t Packet4f;
30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef int32x4_t   Packet4i;
31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef uint32x4_t  Packet4ui;
32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
// Declares a constant float packet named p4f_<NAME> with every lane set to X.
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

// Declares a constant float packet named p4f_<NAME> whose lanes all carry the
// bit pattern of the integer X. The integer is first broadcast into a Packet4i
// and then reinterpreted (no value conversion) as a Packet4f.
// The previous form called pset1<int>(X), which names a scalar rather than a
// packet, and passed it to the unsigned reinterpret intrinsic.
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_s32(pset1<Packet4i>(X))

// Declares a constant integer packet named p4i_<NAME> with every lane set to X.
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
// Brace initializers for 2-lane and 4-lane NEON packets. The number of brace
// levels depends on how the compiler represents the packet types.
#if !defined(__llvm__) || defined(__clang__)
  // Default initializer for packets
  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
#else
  // Special treatment for Apple's llvm-gcc: its NEON packet types are unions,
  // so one extra level of braces is required.
  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
#endif
51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
// Provide __pld(x) when the toolchain does not: it emits an ARM "pld"
// instruction for address x. Note that the expansion already ends with a
// semicolon.
#if !defined(__pld)
#define __pld(x) asm volatile ( "   pld [%[addr]]\n" :: [addr] "r" (x) : "cc" );
#endif
55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct packet_traits<float>  : default_packet_traits
57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  typedef Packet4f type;
59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  enum {
60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Vectorizable = 1,
61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    AlignedOnScalar = 1,
62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    size = 4,
63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    HasDiv  = 1,
65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    // FIXME check the Has*
66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    HasSin  = 0,
67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    HasCos  = 0,
68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    HasLog  = 0,
69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    HasExp  = 0,
70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    HasSqrt = 0
71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  };
72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath};
73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct packet_traits<int>    : default_packet_traits
74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  typedef Packet4i type;
76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  enum {
77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Vectorizable = 1,
78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    AlignedOnScalar = 1,
79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    size=4
80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    // FIXME check the Has*
81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  };
82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath};
83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// workaround gcc 4.2, 4.3 and 4.4 compilatin issue
86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif
91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   { return vdupq_n_s32(from); }
97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return vaddq_f32(pset1<Packet4f>(a), countdown);
102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return vaddq_s32(pset1<Packet4i>(a), countdown);
107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Packet4f inv, restep, div;
124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // Newton-Raphson and vrecpsq_f32()
130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  inv = vrecpeq_f32(b);
131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // This returns a differential, by which we will have to multiply inv to get a better
133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // approximation of 1/b.
134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  restep = vrecpsq_f32(b, inv);
135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  inv = vmulq_f32(restep, inv);
136c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
137c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // Finally, multiply a by 1/b and get the wanted result of the division.
138c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  div = vmulq_f32(a, inv);
139c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
140c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return div;
141c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
142c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
143c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ eigen_assert(false && "packet integer division are not supported by NEON");
144c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return pset1<Packet4i>(0);
145c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
146c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
147c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for some weird raisons, it has to be overloaded for packet of integers
148c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
149c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
150c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
151c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
152c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
153c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
154c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
155c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
156c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
157c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
158c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
159c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
160c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
161c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
162c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
163c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
164c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
165c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
166c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
167c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
168c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
169c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
170c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
171c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
172c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
173c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
174c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
175c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
176c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
177c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
178c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
179c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
180c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
181c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
182c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
183c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*   from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
184c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
185c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
186c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)   { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
187c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
188c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
189c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
190c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float32x2_t lo, hi;
191c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  lo = vdup_n_f32(*from);
192c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  hi = vdup_n_f32(*(from+1));
193c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return vcombine_f32(lo, hi);
194c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
195c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
196c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
197c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32x2_t lo, hi;
198c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  lo = vdup_n_s32(*from);
199c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  hi = vdup_n_s32(*(from+1));
200c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return vcombine_s32(lo, hi);
201c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
202c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
203c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
204c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
205c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
206c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
207c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
208c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
209c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { __pld(addr); }
210c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { __pld(addr); }
211c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
212c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// FIXME only store the 2 first elements ?
213c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
214c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
215c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
216c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
217c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float32x2_t a_lo, a_hi;
218c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Packet4f a_r64;
219c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
220c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_r64 = vrev64q_f32(a);
221c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_lo = vget_low_f32(a_r64);
222c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_hi = vget_high_f32(a_r64);
223c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return vcombine_f32(a_hi, a_lo);
224c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
225c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
226c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32x2_t a_lo, a_hi;
227c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Packet4i a_r64;
228c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
229c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_r64 = vrev64q_s32(a);
230c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_lo = vget_low_s32(a_r64);
231c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_hi = vget_high_s32(a_r64);
232c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return vcombine_s32(a_hi, a_lo);
233c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
234c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
235c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
236c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
237c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
238c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
239c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float32x2_t a_lo, a_hi, sum;
240c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float s[2];
241c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
242c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_lo = vget_low_f32(a);
243c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_hi = vget_high_f32(a);
244c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sum = vpadd_f32(a_lo, a_hi);
245c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sum = vpadd_f32(sum, sum);
246c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vst1_f32(s, sum);
247c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
248c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return s[0];
249c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
250c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
251c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
252c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
253c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float32x4x2_t vtrn1, vtrn2, res1, res2;
254c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Packet4f sum1, sum2, sum;
255c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
256c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // NEON zip performs interleaving of the supplied vectors.
257c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // We perform two interleaves in a row to acquire the transposed vector
258c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
259c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
260c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
261c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
262c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
263c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // Do the addition of the resulting vectors
264c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
265c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
266c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sum = vaddq_f32(sum1, sum2);
267c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
268c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return sum;
269c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
270c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
271c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
272c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
273c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32x2_t a_lo, a_hi, sum;
274c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32_t s[2];
275c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
276c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_lo = vget_low_s32(a);
277c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_hi = vget_high_s32(a);
278c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sum = vpadd_s32(a_lo, a_hi);
279c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sum = vpadd_s32(sum, sum);
280c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vst1_s32(s, sum);
281c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
282c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return s[0];
283c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
284c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
285c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
286c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
287c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32x4x2_t vtrn1, vtrn2, res1, res2;
288c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Packet4i sum1, sum2, sum;
289c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
290c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // NEON zip performs interleaving of the supplied vectors.
291c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // We perform two interleaves in a row to acquire the transposed vector
292c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
293c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
294c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
295c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
296c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
297c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // Do the addition of the resulting vectors
298c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
299c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
300c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sum = vaddq_s32(sum1, sum2);
301c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
302c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return sum;
303c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
304c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
305c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Other reduction functions:
306c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// mul
307c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
308c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
309c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float32x2_t a_lo, a_hi, prod;
310c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float s[2];
311c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
312c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
313c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_lo = vget_low_f32(a);
314c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_hi = vget_high_f32(a);
315c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
316c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  prod = vmul_f32(a_lo, a_hi);
317c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // Multiply prod with its swapped value |a2*a4|a1*a3|
318c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  prod = vmul_f32(prod, vrev64_f32(prod));
319c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vst1_f32(s, prod);
320c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
321c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return s[0];
322c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
323c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
324c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
325c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32x2_t a_lo, a_hi, prod;
326c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32_t s[2];
327c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
328c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
329c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_lo = vget_low_s32(a);
330c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_hi = vget_high_s32(a);
331c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
332c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  prod = vmul_s32(a_lo, a_hi);
333c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // Multiply prod with its swapped value |a2*a4|a1*a3|
334c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  prod = vmul_s32(prod, vrev64_s32(prod));
335c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vst1_s32(s, prod);
336c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
337c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return s[0];
338c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
339c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
// min
341c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
342c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
343c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float32x2_t a_lo, a_hi, min;
344c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float s[2];
345c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
346c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_lo = vget_low_f32(a);
347c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_hi = vget_high_f32(a);
348c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  min = vpmin_f32(a_lo, a_hi);
349c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  min = vpmin_f32(min, min);
350c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vst1_f32(s, min);
351c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
352c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return s[0];
353c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
354c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
355c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
356c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32x2_t a_lo, a_hi, min;
357c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32_t s[2];
358c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
359c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_lo = vget_low_s32(a);
360c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_hi = vget_high_s32(a);
361c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  min = vpmin_s32(a_lo, a_hi);
362c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  min = vpmin_s32(min, min);
363c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vst1_s32(s, min);
364c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
365c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return s[0];
366c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
367c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
// max
369c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
370c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
371c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float32x2_t a_lo, a_hi, max;
372c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  float s[2];
373c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
374c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_lo = vget_low_f32(a);
375c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_hi = vget_high_f32(a);
376c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  max = vpmax_f32(a_lo, a_hi);
377c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  max = vpmax_f32(max, max);
378c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vst1_f32(s, max);
379c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
380c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return s[0];
381c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
382c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
383c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
384c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32x2_t a_lo, a_hi, max;
385c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int32_t s[2];
386c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
387c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_lo = vget_low_s32(a);
388c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  a_hi = vget_high_s32(a);
389c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  max = vpmax_s32(a_lo, a_hi);
390c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  max = vpmax_s32(max, max);
391c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  vst1_s32(s, max);
392c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
393c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return s[0];
394c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
395c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
396c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
397c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
398c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define PALIGN_NEON(Offset,Type,Command) \
399c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<>\
400c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct palign_impl<Offset,Type>\
401c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{\
402c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
403c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {\
404c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        if (Offset!=0)\
405c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            first = Command(first, second, Offset);\
406c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }\
407c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath};\
408c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
409c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(0,Packet4f,vextq_f32)
410c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(1,Packet4f,vextq_f32)
411c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(2,Packet4f,vextq_f32)
412c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(3,Packet4f,vextq_f32)
413c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(0,Packet4i,vextq_s32)
414c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(1,Packet4i,vextq_s32)
415c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(2,Packet4i,vextq_s32)
416c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPALIGN_NEON(3,Packet4i,vextq_s32)
417c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
418c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#undef PALIGN_NEON
419c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
420c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace internal
421c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
422c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace Eigen
423c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
424c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif // EIGEN_PACKET_MATH_NEON_H
425