1/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __PMMINTRIN_H 25#define __PMMINTRIN_H 26 27#include <emmintrin.h> 28 29/* Define the default attributes for the functions in this file. */ 30#define __DEFAULT_FN_ATTRS \ 31 __attribute__((__always_inline__, __nodebug__, __target__("sse3"))) 32 33/// \brief Loads data from an unaligned memory location to elements in a 128-bit 34/// vector. If the address of the data is not 16-byte aligned, the 35/// instruction may read two adjacent aligned blocks of memory to retrieve 36/// the requested data. 37/// 38/// \headerfile <x86intrin.h> 39/// 40/// This intrinsic corresponds to the \c VLDDQU instruction. 41/// 42/// \param __p 43/// A pointer to a 128-bit integer vector containing integer values. 44/// \returns A 128-bit vector containing the moved values. 45static __inline__ __m128i __DEFAULT_FN_ATTRS 46_mm_lddqu_si128(__m128i const *__p) 47{ 48 return (__m128i)__builtin_ia32_lddqu((char const *)__p); 49} 50 51/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 52/// two 128-bit vectors of [4 x float]. 53/// 54/// \headerfile <x86intrin.h> 55/// 56/// This intrinsic corresponds to the \c VADDSUBPS instruction. 57/// 58/// \param __a 59/// A 128-bit vector of [4 x float] containing the left source operand. 60/// \param __b 61/// A 128-bit vector of [4 x float] containing the right source operand. 62/// \returns A 128-bit vector of [4 x float] containing the alternating sums and 63/// differences of both operands. 64static __inline__ __m128 __DEFAULT_FN_ATTRS 65_mm_addsub_ps(__m128 __a, __m128 __b) 66{ 67 return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); 68} 69 70/// \brief Horizontally adds the adjacent pairs of values contained in two 71/// 128-bit vectors of [4 x float]. 72/// 73/// \headerfile <x86intrin.h> 74/// 75/// This intrinsic corresponds to the \c VHADDPS instruction. 76/// 77/// \param __a 78/// A 128-bit vector of [4 x float] containing one of the source operands. 79/// The horizontal sums of the values are stored in the lower bits of the 80/// destination. 81/// \param __b 82/// A 128-bit vector of [4 x float] containing one of the source operands. 83/// The horizontal sums of the values are stored in the upper bits of the 84/// destination. 85/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of 86/// both operands. 87static __inline__ __m128 __DEFAULT_FN_ATTRS 88_mm_hadd_ps(__m128 __a, __m128 __b) 89{ 90 return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); 91} 92 93/// \brief Horizontally subtracts the adjacent pairs of values contained in two 94/// 128-bit vectors of [4 x float]. 95/// 96/// \headerfile <x86intrin.h> 97/// 98/// This intrinsic corresponds to the \c VHSUBPS instruction. 99/// 100/// \param __a 101/// A 128-bit vector of [4 x float] containing one of the source operands. 102/// The horizontal differences between the values are stored in the lower 103/// bits of the destination. 104/// \param __b 105/// A 128-bit vector of [4 x float] containing one of the source operands. 106/// The horizontal differences between the values are stored in the upper 107/// bits of the destination. 108/// \returns A 128-bit vector of [4 x float] containing the horizontal 109/// differences of both operands. 110static __inline__ __m128 __DEFAULT_FN_ATTRS 111_mm_hsub_ps(__m128 __a, __m128 __b) 112{ 113 return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); 114} 115 116/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit 117/// vector of [4 x float] to float values stored in a 128-bit vector of 118/// [4 x float]. 119/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of 120/// the destination. 121/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the 122/// destination. 123/// 124/// \headerfile <x86intrin.h> 125/// 126/// This intrinsic corresponds to the \c VMOVSHDUP instruction. 127/// 128/// \param __a 129/// A 128-bit vector of [4 x float]. 130/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated 131/// values. 132static __inline__ __m128 __DEFAULT_FN_ATTRS 133_mm_movehdup_ps(__m128 __a) 134{ 135 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); 136} 137 138/// \brief Duplicates low-order (even-indexed) values from a 128-bit 139/// vector of [4 x float] to float values stored in a 128-bit vector of 140/// [4 x float]. 141/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of 142/// the destination. 143/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the 144/// destination. 145/// 146/// \headerfile <x86intrin.h> 147/// 148/// This intrinsic corresponds to the \c VMOVSLDUP instruction. 149/// 150/// \param __a 151/// A 128-bit vector of [4 x float]. 152/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated 153/// values. 154static __inline__ __m128 __DEFAULT_FN_ATTRS 155_mm_moveldup_ps(__m128 __a) 156{ 157 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); 158} 159 160/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 161/// two 128-bit vectors of [2 x double]. 162/// 163/// \headerfile <x86intrin.h> 164/// 165/// This intrinsic corresponds to the \c VADDSUBPD instruction. 166/// 167/// \param __a 168/// A 128-bit vector of [2 x double] containing the left source operand. 169/// \param __b 170/// A 128-bit vector of [2 x double] containing the right source operand. 171/// \returns A 128-bit vector of [2 x double] containing the alternating sums 172/// and differences of both operands. 173static __inline__ __m128d __DEFAULT_FN_ATTRS 174_mm_addsub_pd(__m128d __a, __m128d __b) 175{ 176 return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); 177} 178 179/// \brief Horizontally adds the pairs of values contained in two 128-bit 180/// vectors of [2 x double]. 181/// 182/// \headerfile <x86intrin.h> 183/// 184/// This intrinsic corresponds to the \c VHADDPD instruction. 185/// 186/// \param __a 187/// A 128-bit vector of [2 x double] containing one of the source operands. 188/// The horizontal sum of the values is stored in the lower bits of the 189/// destination. 190/// \param __b 191/// A 128-bit vector of [2 x double] containing one of the source operands. 192/// The horizontal sum of the values is stored in the upper bits of the 193/// destination. 194/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of 195/// both operands. 196static __inline__ __m128d __DEFAULT_FN_ATTRS 197_mm_hadd_pd(__m128d __a, __m128d __b) 198{ 199 return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); 200} 201 202/// \brief Horizontally subtracts the pairs of values contained in two 128-bit 203/// vectors of [2 x double]. 204/// 205/// \headerfile <x86intrin.h> 206/// 207/// This intrinsic corresponds to the \c VHSUBPD instruction. 208/// 209/// \param __a 210/// A 128-bit vector of [2 x double] containing one of the source operands. 211/// The horizontal difference of the values is stored in the lower bits of 212/// the destination. 213/// \param __b 214/// A 128-bit vector of [2 x double] containing one of the source operands. 215/// The horizontal difference of the values is stored in the upper bits of 216/// the destination. 217/// \returns A 128-bit vector of [2 x double] containing the horizontal 218/// differences of both operands. 219static __inline__ __m128d __DEFAULT_FN_ATTRS 220_mm_hsub_pd(__m128d __a, __m128d __b) 221{ 222 return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); 223} 224 225/// \brief Moves and duplicates one double-precision value to double-precision 226/// values stored in a 128-bit vector of [2 x double]. 227/// 228/// \headerfile <x86intrin.h> 229/// 230/// \code 231/// __m128d _mm_loaddup_pd(double const * dp); 232/// \endcode 233/// 234/// This intrinsic corresponds to the \c VMOVDDUP instruction. 235/// 236/// \param dp 237/// A pointer to a double-precision value to be moved and duplicated. 238/// \returns A 128-bit vector of [2 x double] containing the moved and 239/// duplicated values. 240#define _mm_loaddup_pd(dp) _mm_load1_pd(dp) 241 242/// \brief Moves and duplicates the double-precision value in the lower bits of 243/// a 128-bit vector of [2 x double] to double-precision values stored in a 244/// 128-bit vector of [2 x double]. 245/// 246/// \headerfile <x86intrin.h> 247/// 248/// This intrinsic corresponds to the \c VMOVDDUP instruction. 249/// 250/// \param __a 251/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits 252/// [127:64] and [63:0] of the destination. 253/// \returns A 128-bit vector of [2 x double] containing the moved and 254/// duplicated values. 255static __inline__ __m128d __DEFAULT_FN_ATTRS 256_mm_movedup_pd(__m128d __a) 257{ 258 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 259} 260 261#define _MM_DENORMALS_ZERO_ON (0x0040) 262#define _MM_DENORMALS_ZERO_OFF (0x0000) 263 264#define _MM_DENORMALS_ZERO_MASK (0x0040) 265 266#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 267#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 268 269/// \brief Establishes a linear address memory range to be monitored and puts 270/// the processor in the monitor event pending state. Data stored in the 271/// monitored address range causes the processor to exit the pending state. 272/// 273/// \headerfile <x86intrin.h> 274/// 275/// This intrinsic corresponds to the \c MONITOR instruction. 276/// 277/// \param __p 278/// The memory range to be monitored. The size of the range is determined by 279/// CPUID function 0000_0005h. 280/// \param __extensions 281/// Optional extensions for the monitoring state. 282/// \param __hints 283/// Optional hints for the monitoring state. 284static __inline__ void __DEFAULT_FN_ATTRS 285_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) 286{ 287 __builtin_ia32_monitor((void *)__p, __extensions, __hints); 288} 289 290/// \brief Used with the MONITOR instruction to wait while the processor is in 291/// the monitor event pending state. Data stored in the monitored address 292/// range causes the processor to exit the pending state. 293/// 294/// \headerfile <x86intrin.h> 295/// 296/// This intrinsic corresponds to the \c MWAIT instruction. 297/// 298/// \param __extensions 299/// Optional extensions for the monitoring state, which may vary by 300/// processor. 301/// \param __hints 302/// Optional hints for the monitoring state, which may vary by processor. 303static __inline__ void __DEFAULT_FN_ATTRS 304_mm_mwait(unsigned __extensions, unsigned __hints) 305{ 306 __builtin_ia32_mwait(__extensions, __hints); 307} 308 309#undef __DEFAULT_FN_ATTRS 310 311#endif /* __PMMINTRIN_H */ 312