1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This file is part of Eigen, a lightweight C++ template library 2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for linear algebra. 3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com> 5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr> 6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This Source Code Form is subject to the terms of the Mozilla 8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Public License v. 2.0. If a copy of the MPL was not distributed 9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_GEOMETRY_SSE_H 12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_GEOMETRY_SSE_H 13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace Eigen { 15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace internal { 17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<class Derived, class OtherDerived> 19c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned> 20c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 21c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b) 22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000)); 24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Quaternion<float> res; 25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath __m128 a = _a.coeffs().template packet<Aligned>(0); 26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath __m128 b = _b.coeffs().template packet<Aligned>(0); 27c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath __m128 flip1 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,1,2,0,2), 28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec4f_swizzle1(b,2,0,1,2)),mask); 29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath __m128 flip2 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,3,3,3,1), 30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec4f_swizzle1(b,0,1,2,1)),mask); 31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pstore(&res.x(), 32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), 33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), 34c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath vec4f_swizzle1(b,1,2,0,0))), 35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _mm_add_ps(flip1,flip2))); 36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return res; 37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename VectorLhs,typename VectorRhs> 41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true> 42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath static inline typename plain_matrix_type<VectorLhs>::type 44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath run(const VectorLhs& lhs, const VectorRhs& rhs) 45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath __m128 a = lhs.template packet<VectorLhs::Flags&AlignedBit ? Aligned : Unaligned>(0); 47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath __m128 b = rhs.template packet<VectorRhs::Flags&AlignedBit ? Aligned : Unaligned>(0); 48c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); 49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); 50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename plain_matrix_type<VectorLhs>::type res; 51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pstore(&res.x(),_mm_sub_ps(mul1,mul2)); 52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return res; 53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<class Derived, class OtherDerived> 60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned> 61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b) 63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); 65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Quaternion<double> res; 67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const double* a = _a.coeffs().data(); 69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet2d b_xy = _b.coeffs().template packet<Aligned>(0); 70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet2d b_zw = _b.coeffs().template packet<Aligned>(2); 71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet2d a_xx = pset1<Packet2d>(a[0]); 72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet2d a_yy = pset1<Packet2d>(a[1]); 73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet2d a_zz = pset1<Packet2d>(a[2]); 74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet2d a_ww = pset1<Packet2d>(a[3]); 75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // two temporaries: 77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet2d t1, t2; 78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* 80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * t1 = ww*xy + yy*zw 81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * t2 = zz*xy - xx*zw 82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * res.xy = t1 +/- swap(t2) 83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath */ 84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw)); 85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); 86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifdef EIGEN_VECTORIZE_SSE3 87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_UNUSED_VARIABLE(mask) 88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pstore(&res.x(), _mm_addsub_pd(t1, preverse(t2))); 89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#else 90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pstore(&res.x(), padd(t1, pxor(mask,preverse(t2)))); 91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* 94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * t1 = ww*zw - yy*xy 95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * t2 = zz*zw + xx*xy 96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2) 97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath */ 98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy)); 99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); 100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifdef EIGEN_VECTORIZE_SSE3 101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_UNUSED_VARIABLE(mask) 102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pstore(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); 103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#else 104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pstore(&res.z(), psub(t1, pxor(mask,preverse(t2)))); 105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return res; 108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace internal 112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace Eigen 114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif // EIGEN_GEOMETRY_SSE_H 116