1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This file is part of Eigen, a lightweight C++ template library 2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for linear algebra. 3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2007 Julien Pommier 5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> 6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This Source Code Form is subject to the terms of the Mozilla 8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Public License v. 2.0. If a copy of the MPL was not distributed 9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* The sin, cos, exp, and log functions of this file come from 12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ 13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath */ 14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_MATH_FUNCTIONS_SSE_H 16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_MATH_FUNCTIONS_SSE_H 17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace Eigen { 19c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 20c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace internal { 21c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPacket4f plog<Packet4f>(const Packet4f& _x) 24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f x = _x; 26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); 27c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); 28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); 29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); 31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* the smallest non denormalized float number */ 33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); 347faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f); 352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* natural logarithm computed for 4 simultaneous float 37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return NaN for x <= 0 38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath */ 39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); 40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); 41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); 42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); 43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); 44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); 45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); 46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); 47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); 48c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); 49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); 50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); 51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i emm0; 54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 55615d816d068b4d0f5e8df601930b5f160bf7eda1Tim Murray Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN 567faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); 57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ 59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); 60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* keep only the fractional part */ 62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = _mm_and_ps(x, p4f_inv_mant_mask); 63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = _mm_or_ps(x, p4f_half); 64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm0 = _mm_sub_epi32(emm0, p4i_0x7f); 662b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); 67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* part2: 69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if( x < SQRTHF ) { 70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath e -= 1; 71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = x + x - 1.0; 72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } else { x = x - 1.0; } 73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath */ 74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); 752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f tmp = pand(x, mask); 76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = psub(x, p4f_1); 772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang e = psub(e, pand(p4f_1, mask)); 78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = padd(x, tmp); 79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f x2 = pmul(x,x); 81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f x3 = pmul(x2,x); 82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f y, y1, y2; 84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); 85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); 86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); 87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y , x, p4f_cephes_log_p2); 88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y1 = pmadd(y1, x, p4f_cephes_log_p5); 89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmadd(y2, x, p4f_cephes_log_p8); 90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y, x3, y1); 91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y, x3, y2); 92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmul(y, x3); 93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y1 = pmul(e, p4f_cephes_log_q1); 95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath tmp = pmul(x2, p4f_half); 96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = padd(y, y1); 97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = psub(x, tmp); 98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmul(e, p4f_cephes_log_q2); 99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = padd(x, y); 100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = padd(x, y2); 1017faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // negative arg will be NAN, 0 will be -INF 1027faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), 1037faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _mm_and_ps(iszero_mask, p4f_minus_inf)); 104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPacket4f pexp<Packet4f>(const Packet4f& _x) 108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f x = _x; 110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); 111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); 112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); 113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); 116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); 117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); 119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); 120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); 121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); 123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); 124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); 125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); 126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); 127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); 128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 1292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f tmp, fx; 130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i emm0; 131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // clamp x 133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); 134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* express exp(x) as exp(g + n*log(2)) */ 136c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); 137c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 1387faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#ifdef EIGEN_VECTORIZE_SSE4_1 1397faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez fx = _mm_floor_ps(fx); 1407faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#else 141c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm0 = _mm_cvttps_epi32(fx); 142c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath tmp = _mm_cvtepi32_ps(emm0); 143c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* if greater, substract 1 */ 144c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f mask = _mm_cmpgt_ps(tmp, fx); 145c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath mask = _mm_and_ps(mask, p4f_1); 146c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath fx = psub(tmp, mask); 1477faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#endif 148c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 149c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath tmp = pmul(fx, p4f_cephes_exp_C1); 150c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f z = pmul(fx, p4f_cephes_exp_C2); 151c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = psub(x, tmp); 152c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = psub(x, z); 153c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 154c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath z = pmul(x,x); 155c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 156c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f y = p4f_cephes_exp_p0; 157c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y, x, p4f_cephes_exp_p1); 158c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y, x, p4f_cephes_exp_p2); 159c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y, x, p4f_cephes_exp_p3); 160c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y, x, p4f_cephes_exp_p4); 161c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y, x, p4f_cephes_exp_p5); 162c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y, z, x); 163c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = padd(y, p4f_1); 164c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 165c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // build 2^n 166c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm0 = _mm_cvttps_epi32(fx); 167c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm0 = _mm_add_epi32(emm0, p4i_0x7f); 168c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm0 = _mm_slli_epi32(emm0, 23); 169615d816d068b4d0f5e8df601930b5f160bf7eda1Tim Murray return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x); 170c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 1717faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeztemplate<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 1727faaa9f3f0df9d23790277834d426c3d992ac3baCarlos HernandezPacket2d pexp<Packet2d>(const Packet2d& _x) 1737faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez{ 1747faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet2d x = _x; 1757faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1767faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); 1777faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); 1787faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); 1797faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1807faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); 1817faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); 1827faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1837faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); 1847faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1857faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); 1867faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); 1877faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); 1887faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1897faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); 1907faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); 1917faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); 1927faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); 1937faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1947faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); 1957faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); 1967faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); 1977faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet2d tmp, fx; 1997faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet4i emm0; 2007faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2017faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // clamp x 2027faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); 2037faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez /* express exp(x) as exp(g + n*log(2)) */ 2047faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); 2057faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2067faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#ifdef EIGEN_VECTORIZE_SSE4_1 2077faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez fx = _mm_floor_pd(fx); 2087faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#else 2097faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez emm0 = _mm_cvttpd_epi32(fx); 2107faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez tmp = _mm_cvtepi32_pd(emm0); 2117faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez /* if greater, substract 1 */ 2127faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet2d mask = _mm_cmpgt_pd(tmp, fx); 2137faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez mask = _mm_and_pd(mask, p2d_1); 2147faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez fx = psub(tmp, mask); 2157faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#endif 2167faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2177faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez tmp = pmul(fx, p2d_cephes_exp_C1); 2187faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet2d z = pmul(fx, p2d_cephes_exp_C2); 2197faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez x = psub(x, tmp); 2207faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez x = psub(x, z); 2217faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2227faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet2d x2 = pmul(x,x); 2237faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2247faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet2d px = p2d_cephes_exp_p0; 2257faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez px = pmadd(px, x2, p2d_cephes_exp_p1); 2267faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez px = pmadd(px, x2, p2d_cephes_exp_p2); 2277faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez px = pmul (px, x); 2287faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2297faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet2d qx = p2d_cephes_exp_q0; 2307faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez qx = pmadd(qx, x2, p2d_cephes_exp_q1); 2317faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez qx = pmadd(qx, x2, p2d_cephes_exp_q2); 2327faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez qx = pmadd(qx, x2, p2d_cephes_exp_q3); 2337faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2347faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez x = pdiv(px,psub(qx,px)); 2357faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez x = pmadd(p2d_2,x,p2d_1); 2367faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2377faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // build 2^n 2387faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez emm0 = _mm_cvttpd_epi32(fx); 2397faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez emm0 = _mm_add_epi32(emm0, p4i_1023_0); 2407faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez emm0 = _mm_slli_epi32(emm0, 20); 2417faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); 242615d816d068b4d0f5e8df601930b5f160bf7eda1Tim Murray return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); 2437faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez} 244c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 245c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* evaluation of 4 sines at onces, using SSE2 intrinsics. 246c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 247c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath The code is the exact rewriting of the cephes sinf function. 248c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Precision is excellent as long as x < 8192 (I did not bother to 249c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath take into account the special handling they have for greater values 250c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath -- it does not return garbage for arguments over 8192, though, but 251c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath the extra precision is missing). 252c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 253c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the 254c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath surprising but correct result. 255c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath*/ 256c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 257c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 258c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPacket4f psin<Packet4f>(const Packet4f& _x) 259c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 260c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f x = _x; 261c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); 262c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); 263c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 264c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4i(1, 1); 265c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); 266c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4i(2, 2); 267c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4i(4, 4); 268c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 269c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); 270c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 271c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); 272c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); 273c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); 274c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); 275c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); 276c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); 277c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); 278c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); 279c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); 280c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI 281c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 2822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f xmm1, xmm2, xmm3, sign_bit, y; 283c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 284c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i emm0, emm2; 285c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sign_bit = x; 286c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* take the absolute value */ 287c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = pabs(x); 288c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 289c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* take the modulo */ 290c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 291c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* extract the sign bit (upper one) */ 292c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask); 293c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 294c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* scale by 4/Pi */ 295c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmul(x, p4f_cephes_FOPI); 296c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 297c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* store the integer part of y in mm0 */ 298c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_cvttps_epi32(y); 299c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* j=(j+1) & (~1) (see the cephes sources) */ 300c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_add_epi32(emm2, p4i_1); 301c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_and_si128(emm2, p4i_not1); 302c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = _mm_cvtepi32_ps(emm2); 303c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* get the swap sign flag */ 304c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm0 = _mm_and_si128(emm2, p4i_4); 305c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm0 = _mm_slli_epi32(emm0, 29); 306c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* get the polynom selection mask 307c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath there is one polynom for 0 <= x <= Pi/4 308c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath and another one for Pi/4<x<=Pi/2 309c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 310c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Both branches will be computed. 311c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath */ 312c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_and_si128(emm2, p4i_2); 313c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 314c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 315c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f swap_sign_bit = _mm_castsi128_ps(emm0); 316c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f poly_mask = _mm_castsi128_ps(emm2); 317c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit); 318c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 319c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* The magic pass: "Extended precision modular arithmetic" 320c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = ((x - y * DP1) - y * DP2) - y * DP3; */ 321c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath xmm1 = pmul(y, p4f_minus_cephes_DP1); 322c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath xmm2 = pmul(y, p4f_minus_cephes_DP2); 323c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath xmm3 = pmul(y, p4f_minus_cephes_DP3); 324c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = padd(x, xmm1); 325c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = padd(x, xmm2); 326c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = padd(x, xmm3); 327c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 328c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* Evaluate the first polynom (0 <= x <= Pi/4) */ 329c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = p4f_coscof_p0; 330c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f z = _mm_mul_ps(x,x); 331c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 332c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y, z, p4f_coscof_p1); 333c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y, z, p4f_coscof_p2); 334c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmul(y, z); 335c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmul(y, z); 336c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f tmp = pmul(z, p4f_half); 337c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = psub(y, tmp); 338c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = padd(y, p4f_1); 339c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 340c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* Evaluate the second polynom (Pi/4 <= x <= 0) */ 341c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 342c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f y2 = p4f_sincof_p0; 343c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmadd(y2, z, p4f_sincof_p1); 344c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmadd(y2, z, p4f_sincof_p2); 345c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmul(y2, z); 346c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmul(y2, x); 347c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = padd(y2, x); 348c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 349c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* select the correct result from the two polynoms */ 350c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = _mm_and_ps(poly_mask, y2); 351c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = _mm_andnot_ps(poly_mask, y); 352c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = _mm_or_ps(y,y2); 353c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* update the sign */ 354c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return _mm_xor_ps(y, sign_bit); 355c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 356c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 357c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* almost the same as psin */ 358c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 359c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPacket4f pcos<Packet4f>(const Packet4f& _x) 360c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 361c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f x = _x; 362c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); 363c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); 364c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 365c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4i(1, 1); 366c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); 367c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4i(2, 2); 368c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4i(4, 4); 369c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 370c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); 371c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); 372c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); 373c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); 374c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); 375c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); 376c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); 377c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); 378c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); 379c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI 380c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 3812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f xmm1, xmm2, xmm3, y; 382c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4i emm0, emm2; 383c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 384c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = pabs(x); 385c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 386c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* scale by 4/Pi */ 387c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmul(x, p4f_cephes_FOPI); 388c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 389c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* get the integer part of y */ 390c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_cvttps_epi32(y); 391c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* j=(j+1) & (~1) (see the cephes sources) */ 392c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_add_epi32(emm2, p4i_1); 393c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_and_si128(emm2, p4i_not1); 394c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = _mm_cvtepi32_ps(emm2); 395c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 396c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_sub_epi32(emm2, p4i_2); 397c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 398c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* get the swap sign flag */ 399c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm0 = _mm_andnot_si128(emm2, p4i_4); 400c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm0 = _mm_slli_epi32(emm0, 29); 401c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* get the polynom selection mask */ 402c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_and_si128(emm2, p4i_2); 403c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 404c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 405c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f sign_bit = _mm_castsi128_ps(emm0); 406c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f poly_mask = _mm_castsi128_ps(emm2); 407c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 408c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* The magic pass: "Extended precision modular arithmetic" 409c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = ((x - y * DP1) - y * DP2) - y * DP3; */ 410c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath xmm1 = pmul(y, p4f_minus_cephes_DP1); 411c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath xmm2 = pmul(y, p4f_minus_cephes_DP2); 412c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath xmm3 = pmul(y, p4f_minus_cephes_DP3); 413c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = padd(x, xmm1); 414c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = padd(x, xmm2); 415c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = padd(x, xmm3); 416c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 417c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* Evaluate the first polynom (0 <= x <= Pi/4) */ 418c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = p4f_coscof_p0; 419c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f z = pmul(x,x); 420c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 421c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y,z,p4f_coscof_p1); 422c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmadd(y,z,p4f_coscof_p2); 423c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmul(y, z); 424c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = pmul(y, z); 425c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f tmp = _mm_mul_ps(z, p4f_half); 426c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = psub(y, tmp); 427c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = padd(y, p4f_1); 428c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 429c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* Evaluate the second polynom (Pi/4 <= x <= 0) */ 430c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f y2 = p4f_sincof_p0; 431c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmadd(y2, z, p4f_sincof_p1); 432c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmadd(y2, z, p4f_sincof_p2); 433c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmul(y2, z); 434c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = pmadd(y2, x, x); 435c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 436c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* select the correct result from the two polynoms */ 437c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y2 = _mm_and_ps(poly_mask, y2); 438c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = _mm_andnot_ps(poly_mask, y); 439c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath y = _mm_or_ps(y,y2); 440c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 441c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /* update the sign */ 442c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return _mm_xor_ps(y, sign_bit); 443c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 444c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 4457faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#if EIGEN_FAST_MATH 4467faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 4472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Functions for sqrt. 4482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step 4492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// of Newton's method, at a cost of 1-2 bits of precision as opposed to the 4502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// exact solution. It does not handle +inf, or denormalized numbers correctly. 4512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// The main advantage of this approach is not just speed, but also the fact that 4522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// it can be inlined and pipelined with other computations, further reducing its 4532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// effective latency. This is similar to Quake3's fast inverse square root. 454c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// For detail see here: http://www.beyond3d.com/content/articles/8/ 455c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 456c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathPacket4f psqrt<Packet4f>(const Packet4f& _x) 457c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 458c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Packet4f half = pmul(_x, pset1<Packet4f>(.5f)); 4592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f denormal_mask = _mm_and_ps( 4602b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _mm_cmpge_ps(_x, _mm_setzero_ps()), 4612b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _mm_cmplt_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)()))); 462c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 4632b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // Compute approximate reciprocal sqrt. 4642b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f x = _mm_rsqrt_ps(_x); 4652b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // Do a single step of Newton's iteration. 466c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x)))); 4672b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // Flush results for denormals to zero. 4682b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm_andnot_ps(denormal_mask, pmul(_x,x)); 469c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 470c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 4717faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#else 4727faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 4732b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 4742b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangPacket4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); } 4752b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4762b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 4772b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4782b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 4792b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangPacket2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); } 4802b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4812b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if EIGEN_FAST_MATH 4822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 4842b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangPacket4f prsqrt<Packet4f>(const Packet4f& _x) { 4852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); 4862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); 4872b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); 4882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); 4892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); 4902b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f neg_half = pmul(_x, p4f_minus_half); 4922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4932b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // select only the inverse sqrt of positive normal inputs (denormals are 4942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // flushed to zero and cause infs as well). 4952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); 4962b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); 4972b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 4982b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // Fill in NaNs and Infs for the negative/zero entries. 4992b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); 5002b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); 5012b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), 5022b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang _mm_and_ps(zero_mask, p4f_inf)); 5032b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // Do a single step of Newton's iteration. 5052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); 5062b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5072b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // Insert NaNs and Infs in all the right places. 5082b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm_or_ps(x, infs_and_nans); 5092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5102b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5112b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 5122b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5132b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 5142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangPacket4f prsqrt<Packet4f>(const Packet4f& x) { 5152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation. 5162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x)); 5172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5187faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 5197faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#endif 5207faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 5212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 5222b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangPacket2d prsqrt<Packet2d>(const Packet2d& x) { 5232b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. 5242b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x)); 5252b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang// Hyperbolic Tangent function. 5282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate <> 5292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f 5302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangptanh<Packet4f>(const Packet4f& x) { 5312b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return internal::generic_fast_tanh_float(x); 5322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5337faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 534c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace internal 535c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 5362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace numext { 5372b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> 5392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 5402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangfloat sqrt(const float &x) 5412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 5422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x)))); 5432b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<> 5462b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 5472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangdouble sqrt(const double &x) 5482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{ 5492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#if EIGEN_COMP_GNUC_STRICT 5502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // This works around a GCC bug generating poor code for _mm_sqrt_pd 5512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b 5522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); 5532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#else 5542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); 5552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang#endif 5562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} 5572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 5582b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang} // end namespace numex 5592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang 560c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace Eigen 561c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 562c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif // EIGEN_MATH_FUNCTIONS_SSE_H 563