111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Optimizations for random number functions, x86 version -*- C++ -*- 211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Copyright (C) 2012-2014 Free Software Foundation, Inc. 411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// 511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// This file is part of the GNU ISO C++ Library. This library is free 611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// software; you can redistribute it and/or modify it under the 711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// terms of the GNU General Public License as published by the 811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Free Software Foundation; either version 3, or (at your option) 911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// any later version. 1011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 1111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// This library is distributed in the hope that it will be useful, 1211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// but WITHOUT ANY WARRANTY; without even the implied warranty of 1311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// GNU General Public License for more details. 1511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 1611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Under Section 7 of GPL version 3, you are granted additional 1711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// permissions described in the GCC Runtime Library Exception, version 1811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// 3.1, as published by the Free Software Foundation. 1911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 2011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// You should have received a copy of the GNU General Public License and 2111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// a copy of the GCC Runtime Library Exception along with this program; 2211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 2311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// <http://www.gnu.org/licenses/>. 2411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 2511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert/** @file bits/opt_random.h 2611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * This is an internal header file, included by other library headers. 2711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * Do not attempt to use it directly. @headername{random} 2811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert */ 2911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 3011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#ifndef _BITS_OPT_RANDOM_H 3111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#define _BITS_OPT_RANDOM_H 1 3211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 3311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#include <x86intrin.h> 3411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 3511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 3611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#pragma GCC system_header 3711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 3811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 3911cd02dfb91661c65134cac258cf5924270e9d2Dan Albertnamespace std _GLIBCXX_VISIBILITY(default) 4011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert{ 4111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert_GLIBCXX_BEGIN_NAMESPACE_VERSION 4211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 4311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#ifdef __SSE3__ 4411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert template<> 4511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert template<typename _UniformRandomNumberGenerator> 4611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert void 4711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert normal_distribution<double>:: 4811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __generate(typename normal_distribution<double>::result_type* __f, 4911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typename normal_distribution<double>::result_type* __t, 5011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _UniformRandomNumberGenerator& __urng, 5111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const param_type& __param) 5211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 5311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typedef uint64_t __uctype; 5411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 5511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (__f == __t) 5611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert return; 5711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 5811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (_M_saved_available) 5911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 6011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _M_saved_available = false; 6111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert *__f++ = _M_saved * __param.stddev() + __param.mean(); 6211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 6311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (__f == __t) 6411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert return; 6511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 6611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 6711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert constexpr uint64_t __maskval = 0xfffffffffffffull; 6811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert static const __m128i __mask = _mm_set1_epi64x(__maskval); 6911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert static const __m128i __two = _mm_set1_epi64x(0x4000000000000000ull); 7011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert static const __m128d __three = _mm_set1_pd(3.0); 7111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __m128d __av = _mm_set1_pd(__param.mean()); 7211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 7311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __uctype __urngmin = __urng.min(); 7411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __uctype __urngmax = __urng.max(); 7511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __uctype __urngrange = __urngmax - __urngmin; 7611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __uctype __uerngrange = __urngrange + 1; 7711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 7811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__f + 1 < __t) 7911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 8011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert double __le; 8111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __m128d __x; 8211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert do 8311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 8411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert union 8511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 8611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __m128i __i; 8711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __m128d __d; 8811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } __v; 8911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 9011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (__urngrange > __maskval) 9111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 9211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (__detail::_Power_of_2(__uerngrange)) 9311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v.__i = _mm_and_si128(_mm_set_epi64x(__urng(), 9411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __urng()), 9511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __mask); 9611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert else 9711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 9811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __uctype __uerange = __maskval + 1; 9911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __uctype __scaling = __urngrange / __uerange; 10011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __uctype __past = __uerange * __scaling; 10111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert uint64_t __v1; 10211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert do 10311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v1 = __uctype(__urng()) - __urngmin; 10411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__v1 >= __past); 10511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v1 /= __scaling; 10611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert uint64_t __v2; 10711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert do 10811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v2 = __uctype(__urng()) - __urngmin; 10911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__v2 >= __past); 11011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v2 /= __scaling; 11111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 11211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v.__i = _mm_set_epi64x(__v1, __v2); 11311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 11411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 11511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert else if (__urngrange == __maskval) 11611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v.__i = _mm_set_epi64x(__urng(), __urng()); 11711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert else if ((__urngrange + 2) * __urngrange >= __maskval 11811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert && __detail::_Power_of_2(__uerngrange)) 11911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 12011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert uint64_t __v1 = __urng() * __uerngrange + __urng(); 12111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert uint64_t __v2 = __urng() * __uerngrange + __urng(); 12211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 12311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v.__i = _mm_and_si128(_mm_set_epi64x(__v1, __v2), 12411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __mask); 12511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 12611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert else 12711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 12811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert size_t __nrng = 2; 12911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __uctype __high = __maskval / __uerngrange / __uerngrange; 13011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__high > __uerngrange) 13111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 13211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert ++__nrng; 13311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __high /= __uerngrange; 13411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 13511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __uctype __highrange = __high + 1; 13611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __uctype __scaling = __urngrange / __highrange; 13711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const __uctype __past = __highrange * __scaling; 13811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __uctype __tmp; 13911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 14011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert uint64_t __v1; 14111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert do 14211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 14311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert do 14411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __tmp = __uctype(__urng()) - __urngmin; 14511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__tmp >= __past); 14611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v1 = __tmp / __scaling; 14711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert for (size_t __cnt = 0; __cnt < __nrng; ++__cnt) 14811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 14911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __tmp = __v1; 15011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v1 *= __uerngrange; 15111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v1 += __uctype(__urng()) - __urngmin; 15211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 15311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 15411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__v1 > __maskval || __v1 < __tmp); 15511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 15611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert uint64_t __v2; 15711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert do 15811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 15911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert do 16011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __tmp = __uctype(__urng()) - __urngmin; 16111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__tmp >= __past); 16211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v2 = __tmp / __scaling; 16311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert for (size_t __cnt = 0; __cnt < __nrng; ++__cnt) 16411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 16511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __tmp = __v2; 16611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v2 *= __uerngrange; 16711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v2 += __uctype(__urng()) - __urngmin; 16811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 16911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 17011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__v2 > __maskval || __v2 < __tmp); 17111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 17211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v.__i = _mm_set_epi64x(__v1, __v2); 17311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 17411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 17511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __v.__i = _mm_or_si128(__v.__i, __two); 17611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __x = _mm_sub_pd(__v.__d, __three); 17711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __m128d __m = _mm_mul_pd(__x, __x); 17811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __le = _mm_cvtsd_f64(_mm_hadd_pd (__m, __m)); 17911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 18011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__le == 0.0 || __le >= 1.0); 18111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 18211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert double __mult = (std::sqrt(-2.0 * std::log(__le) / __le) 18311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * __param.stddev()); 18411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 18511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __x = _mm_add_pd(_mm_mul_pd(__x, _mm_set1_pd(__mult)), __av); 18611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 18711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _mm_storeu_pd(__f, __x); 18811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __f += 2; 18911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 19011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 19111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (__f != __t) 19211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 19311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert result_type __x, __y, __r2; 19411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 19511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __detail::_Adaptor<_UniformRandomNumberGenerator, result_type> 19611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __aurng(__urng); 19711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 19811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert do 19911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 20011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __x = result_type(2.0) * __aurng() - 1.0; 20111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __y = result_type(2.0) * __aurng() - 1.0; 20211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __r2 = __x * __x + __y * __y; 20311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 20411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__r2 > 1.0 || __r2 == 0.0); 20511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 20611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const result_type __mult = std::sqrt(-2 * std::log(__r2) / __r2); 20711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _M_saved = __x * __mult; 20811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _M_saved_available = true; 20911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert *__f = __y * __mult * __param.stddev() + __param.mean(); 21011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 21111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 21211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#endif 21311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 21411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 21511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert_GLIBCXX_END_NAMESPACE_VERSION 21611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert} // namespace 21711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 21811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 21911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#endif // _BITS_OPT_RANDOM_H 220