111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Optimizations for random number functions, x86 version -*- C++ -*-
211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Copyright (C) 2012-2014 Free Software Foundation, Inc.
411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert//
511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// This file is part of the GNU ISO C++ Library.  This library is free
611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// software; you can redistribute it and/or modify it under the
711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// terms of the GNU General Public License as published by the
811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Free Software Foundation; either version 3, or (at your option)
911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// any later version.
1011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
1111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// This library is distributed in the hope that it will be useful,
1211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// but WITHOUT ANY WARRANTY; without even the implied warranty of
1311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// GNU General Public License for more details.
1511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
1611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Under Section 7 of GPL version 3, you are granted additional
1711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// permissions described in the GCC Runtime Library Exception, version
1811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// 3.1, as published by the Free Software Foundation.
1911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
2011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// You should have received a copy of the GNU General Public License and
2111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// a copy of the GCC Runtime Library Exception along with this program;
2211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
2311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// <http://www.gnu.org/licenses/>.
2411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
2511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert/** @file bits/opt_random.h
2611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert *  This is an internal header file, included by other library headers.
2711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert *  Do not attempt to use it directly. @headername{random}
2811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert */
2911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
3011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#ifndef _BITS_OPT_RANDOM_H
3111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#define _BITS_OPT_RANDOM_H 1
3211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
3311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#include <x86intrin.h>
3411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
3511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
3611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#pragma GCC system_header
3711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
3811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
3911cd02dfb91661c65134cac258cf5924270e9d2Dan Albertnamespace std _GLIBCXX_VISIBILITY(default)
4011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert{
4111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert_GLIBCXX_BEGIN_NAMESPACE_VERSION
4211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
4311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#ifdef __SSE3__
4411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert  template<>
4511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert    template<typename _UniformRandomNumberGenerator>
4611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert      void
4711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert      normal_distribution<double>::
4811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert      __generate(typename normal_distribution<double>::result_type* __f,
4911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		 typename normal_distribution<double>::result_type* __t,
5011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		 _UniformRandomNumberGenerator& __urng,
5111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		 const param_type& __param)
5211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert      {
5311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	typedef uint64_t __uctype;
5411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
5511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	if (__f == __t)
5611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	  return;
5711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
5811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	if (_M_saved_available)
5911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	  {
6011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	    _M_saved_available = false;
6111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	    *__f++ = _M_saved * __param.stddev() + __param.mean();
6211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
6311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	    if (__f == __t)
6411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	      return;
6511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	  }
6611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
6711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	constexpr uint64_t __maskval = 0xfffffffffffffull;
6811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	static const __m128i __mask = _mm_set1_epi64x(__maskval);
6911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	static const __m128i __two = _mm_set1_epi64x(0x4000000000000000ull);
7011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	static const __m128d __three = _mm_set1_pd(3.0);
7111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	const __m128d __av = _mm_set1_pd(__param.mean());
7211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
7311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	const __uctype __urngmin = __urng.min();
7411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	const __uctype __urngmax = __urng.max();
7511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	const __uctype __urngrange = __urngmax - __urngmin;
7611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	const __uctype __uerngrange = __urngrange + 1;
7711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
7811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	while (__f + 1 < __t)
7911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	  {
8011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	    double __le;
8111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	    __m128d __x;
8211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	    do
8311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert	      {
8411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert                union
8511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert                {
8611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert                  __m128i __i;
8711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert                  __m128d __d;
8811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		} __v;
8911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
9011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		if (__urngrange > __maskval)
9111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		  {
9211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    if (__detail::_Power_of_2(__uerngrange))
9311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		      __v.__i = _mm_and_si128(_mm_set_epi64x(__urng(),
9411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert							     __urng()),
9511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert					      __mask);
9611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    else
9711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		      {
9811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			const __uctype __uerange = __maskval + 1;
9911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			const __uctype __scaling = __urngrange / __uerange;
10011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			const __uctype __past = __uerange * __scaling;
10111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			uint64_t __v1;
10211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			do
10311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			  __v1 = __uctype(__urng()) - __urngmin;
10411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			while (__v1 >= __past);
10511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			__v1 /= __scaling;
10611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			uint64_t __v2;
10711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			do
10811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			  __v2 = __uctype(__urng()) - __urngmin;
10911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			while (__v2 >= __past);
11011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			__v2 /= __scaling;
11111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
11211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			__v.__i = _mm_set_epi64x(__v1, __v2);
11311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		      }
11411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		  }
11511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		else if (__urngrange == __maskval)
11611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		  __v.__i = _mm_set_epi64x(__urng(), __urng());
11711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		else if ((__urngrange + 2) * __urngrange >= __maskval
11811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			 && __detail::_Power_of_2(__uerngrange))
11911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		  {
12011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    uint64_t __v1 = __urng() * __uerngrange + __urng();
12111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    uint64_t __v2 = __urng() * __uerngrange + __urng();
12211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
12311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    __v.__i = _mm_and_si128(_mm_set_epi64x(__v1, __v2),
12411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert					    __mask);
12511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		  }
12611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		else
12711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		  {
12811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    size_t __nrng = 2;
12911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    __uctype __high = __maskval / __uerngrange / __uerngrange;
13011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    while (__high > __uerngrange)
13111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		      {
13211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			++__nrng;
13311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			__high /= __uerngrange;
13411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		      }
13511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    const __uctype __highrange = __high + 1;
13611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    const __uctype __scaling = __urngrange / __highrange;
13711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    const __uctype __past = __highrange * __scaling;
13811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    __uctype __tmp;
13911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
14011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    uint64_t __v1;
14111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    do
14211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		      {
14311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			do
14411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			  __tmp = __uctype(__urng()) - __urngmin;
14511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			while (__tmp >= __past);
14611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			__v1 = __tmp / __scaling;
14711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt)
14811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			  {
14911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			    __tmp = __v1;
15011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			    __v1 *= __uerngrange;
15111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			    __v1 += __uctype(__urng()) - __urngmin;
15211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			  }
15311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		      }
15411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    while (__v1 > __maskval || __v1 < __tmp);
15511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
15611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    uint64_t __v2;
15711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    do
15811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		      {
15911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			do
16011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			  __tmp = __uctype(__urng()) - __urngmin;
16111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			while (__tmp >= __past);
16211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			__v2 = __tmp / __scaling;
16311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt)
16411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			  {
16511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			    __tmp = __v2;
16611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			    __v2 *= __uerngrange;
16711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			    __v2 += __uctype(__urng()) - __urngmin;
16811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert			  }
16911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		      }
17011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    while (__v2 > __maskval || __v2 < __tmp);
17111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
17211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		    __v.__i = _mm_set_epi64x(__v1, __v2);
17311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		  }
17411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
17511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		__v.__i = _mm_or_si128(__v.__i, __two);
17611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		__x = _mm_sub_pd(__v.__d, __three);
17711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		__m128d __m = _mm_mul_pd(__x, __x);
17811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert		__le = _mm_cvtsd_f64(_mm_hadd_pd (__m, __m));
17911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert              }
18011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            while (__le == 0.0 || __le >= 1.0);
18111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
18211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            double __mult = (std::sqrt(-2.0 * std::log(__le) / __le)
18311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert                             * __param.stddev());
18411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
18511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            __x = _mm_add_pd(_mm_mul_pd(__x, _mm_set1_pd(__mult)), __av);
18611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
18711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            _mm_storeu_pd(__f, __x);
18811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            __f += 2;
18911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert          }
19011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
19111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert        if (__f != __t)
19211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert          {
19311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            result_type __x, __y, __r2;
19411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
19511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            __detail::_Adaptor<_UniformRandomNumberGenerator, result_type>
19611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert              __aurng(__urng);
19711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
19811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            do
19911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert              {
20011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert                __x = result_type(2.0) * __aurng() - 1.0;
20111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert                __y = result_type(2.0) * __aurng() - 1.0;
20211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert                __r2 = __x * __x + __y * __y;
20311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert              }
20411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            while (__r2 > 1.0 || __r2 == 0.0);
20511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
20611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            const result_type __mult = std::sqrt(-2 * std::log(__r2) / __r2);
20711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            _M_saved = __x * __mult;
20811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            _M_saved_available = true;
20911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert            *__f = __y * __mult * __param.stddev() + __param.mean();
21011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert          }
21111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert      }
21211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#endif
21311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
21411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
21511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert_GLIBCXX_END_NAMESPACE_VERSION
21611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert} // namespace
21711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
21811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert
21911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#endif // _BITS_OPT_RANDOM_H
220