1/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __PMMINTRIN_H 25#define __PMMINTRIN_H 26 27#include <emmintrin.h> 28 29/* Define the default attributes for the functions in this file. */ 30#define __DEFAULT_FN_ATTRS \ 31 __attribute__((__always_inline__, __nodebug__, __target__("sse3"))) 32 33/// \brief Loads data from an unaligned memory location to elements in a 128-bit 34/// vector. 35/// 36/// If the address of the data is not 16-byte aligned, the instruction may 37/// read two adjacent aligned blocks of memory to retrieve the requested 38/// data. 39/// 40/// \headerfile <x86intrin.h> 41/// 42/// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 43/// 44/// \param __p 45/// A pointer to a 128-bit integer vector containing integer values. 46/// \returns A 128-bit vector containing the moved values. 47static __inline__ __m128i __DEFAULT_FN_ATTRS 48_mm_lddqu_si128(__m128i const *__p) 49{ 50 return (__m128i)__builtin_ia32_lddqu((char const *)__p); 51} 52 53/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 54/// two 128-bit vectors of [4 x float]. 55/// 56/// \headerfile <x86intrin.h> 57/// 58/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 59/// 60/// \param __a 61/// A 128-bit vector of [4 x float] containing the left source operand. 62/// \param __b 63/// A 128-bit vector of [4 x float] containing the right source operand. 64/// \returns A 128-bit vector of [4 x float] containing the alternating sums and 65/// differences of both operands. 66static __inline__ __m128 __DEFAULT_FN_ATTRS 67_mm_addsub_ps(__m128 __a, __m128 __b) 68{ 69 return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); 70} 71 72/// \brief Horizontally adds the adjacent pairs of values contained in two 73/// 128-bit vectors of [4 x float]. 74/// 75/// \headerfile <x86intrin.h> 76/// 77/// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 78/// 79/// \param __a 80/// A 128-bit vector of [4 x float] containing one of the source operands. 81/// The horizontal sums of the values are stored in the lower bits of the 82/// destination. 83/// \param __b 84/// A 128-bit vector of [4 x float] containing one of the source operands. 85/// The horizontal sums of the values are stored in the upper bits of the 86/// destination. 87/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of 88/// both operands. 89static __inline__ __m128 __DEFAULT_FN_ATTRS 90_mm_hadd_ps(__m128 __a, __m128 __b) 91{ 92 return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); 93} 94 95/// \brief Horizontally subtracts the adjacent pairs of values contained in two 96/// 128-bit vectors of [4 x float]. 97/// 98/// \headerfile <x86intrin.h> 99/// 100/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 101/// 102/// \param __a 103/// A 128-bit vector of [4 x float] containing one of the source operands. 104/// The horizontal differences between the values are stored in the lower 105/// bits of the destination. 106/// \param __b 107/// A 128-bit vector of [4 x float] containing one of the source operands. 108/// The horizontal differences between the values are stored in the upper 109/// bits of the destination. 110/// \returns A 128-bit vector of [4 x float] containing the horizontal 111/// differences of both operands. 112static __inline__ __m128 __DEFAULT_FN_ATTRS 113_mm_hsub_ps(__m128 __a, __m128 __b) 114{ 115 return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); 116} 117 118/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit 119/// vector of [4 x float] to float values stored in a 128-bit vector of 120/// [4 x float]. 121/// 122/// \headerfile <x86intrin.h> 123/// 124/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 125/// 126/// \param __a 127/// A 128-bit vector of [4 x float]. \n 128/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of 129/// the destination. \n 130/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the 131/// destination. 132/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated 133/// values. 134static __inline__ __m128 __DEFAULT_FN_ATTRS 135_mm_movehdup_ps(__m128 __a) 136{ 137 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); 138} 139 140/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of 141/// [4 x float] to float values stored in a 128-bit vector of [4 x float]. 142/// 143/// \headerfile <x86intrin.h> 144/// 145/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 146/// 147/// \param __a 148/// A 128-bit vector of [4 x float] \n 149/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of 150/// the destination. \n 151/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the 152/// destination. 153/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated 154/// values. 155static __inline__ __m128 __DEFAULT_FN_ATTRS 156_mm_moveldup_ps(__m128 __a) 157{ 158 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); 159} 160 161/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 162/// two 128-bit vectors of [2 x double]. 163/// 164/// \headerfile <x86intrin.h> 165/// 166/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 167/// 168/// \param __a 169/// A 128-bit vector of [2 x double] containing the left source operand. 170/// \param __b 171/// A 128-bit vector of [2 x double] containing the right source operand. 172/// \returns A 128-bit vector of [2 x double] containing the alternating sums 173/// and differences of both operands. 174static __inline__ __m128d __DEFAULT_FN_ATTRS 175_mm_addsub_pd(__m128d __a, __m128d __b) 176{ 177 return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); 178} 179 180/// \brief Horizontally adds the pairs of values contained in two 128-bit 181/// vectors of [2 x double]. 182/// 183/// \headerfile <x86intrin.h> 184/// 185/// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 186/// 187/// \param __a 188/// A 128-bit vector of [2 x double] containing one of the source operands. 189/// The horizontal sum of the values is stored in the lower bits of the 190/// destination. 191/// \param __b 192/// A 128-bit vector of [2 x double] containing one of the source operands. 193/// The horizontal sum of the values is stored in the upper bits of the 194/// destination. 195/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of 196/// both operands. 197static __inline__ __m128d __DEFAULT_FN_ATTRS 198_mm_hadd_pd(__m128d __a, __m128d __b) 199{ 200 return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); 201} 202 203/// \brief Horizontally subtracts the pairs of values contained in two 128-bit 204/// vectors of [2 x double]. 205/// 206/// \headerfile <x86intrin.h> 207/// 208/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 209/// 210/// \param __a 211/// A 128-bit vector of [2 x double] containing one of the source operands. 212/// The horizontal difference of the values is stored in the lower bits of 213/// the destination. 214/// \param __b 215/// A 128-bit vector of [2 x double] containing one of the source operands. 216/// The horizontal difference of the values is stored in the upper bits of 217/// the destination. 218/// \returns A 128-bit vector of [2 x double] containing the horizontal 219/// differences of both operands. 220static __inline__ __m128d __DEFAULT_FN_ATTRS 221_mm_hsub_pd(__m128d __a, __m128d __b) 222{ 223 return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); 224} 225 226/// \brief Moves and duplicates one double-precision value to double-precision 227/// values stored in a 128-bit vector of [2 x double]. 228/// 229/// \headerfile <x86intrin.h> 230/// 231/// \code 232/// __m128d _mm_loaddup_pd(double const * dp); 233/// \endcode 234/// 235/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 236/// 237/// \param dp 238/// A pointer to a double-precision value to be moved and duplicated. 239/// \returns A 128-bit vector of [2 x double] containing the moved and 240/// duplicated values. 241#define _mm_loaddup_pd(dp) _mm_load1_pd(dp) 242 243/// \brief Moves and duplicates the double-precision value in the lower bits of 244/// a 128-bit vector of [2 x double] to double-precision values stored in a 245/// 128-bit vector of [2 x double]. 246/// 247/// \headerfile <x86intrin.h> 248/// 249/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 250/// 251/// \param __a 252/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits 253/// [127:64] and [63:0] of the destination. 254/// \returns A 128-bit vector of [2 x double] containing the moved and 255/// duplicated values. 256static __inline__ __m128d __DEFAULT_FN_ATTRS 257_mm_movedup_pd(__m128d __a) 258{ 259 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 260} 261 262/// \brief Establishes a linear address memory range to be monitored and puts 263/// the processor in the monitor event pending state. Data stored in the 264/// monitored address range causes the processor to exit the pending state. 265/// 266/// \headerfile <x86intrin.h> 267/// 268/// This intrinsic corresponds to the <c> MONITOR </c> instruction. 269/// 270/// \param __p 271/// The memory range to be monitored. The size of the range is determined by 272/// CPUID function 0000_0005h. 273/// \param __extensions 274/// Optional extensions for the monitoring state. 275/// \param __hints 276/// Optional hints for the monitoring state. 277static __inline__ void __DEFAULT_FN_ATTRS 278_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) 279{ 280 __builtin_ia32_monitor((void *)__p, __extensions, __hints); 281} 282 283/// \brief Used with the MONITOR instruction to wait while the processor is in 284/// the monitor event pending state. Data stored in the monitored address 285/// range causes the processor to exit the pending state. 286/// 287/// \headerfile <x86intrin.h> 288/// 289/// This intrinsic corresponds to the <c> MWAIT </c> instruction. 290/// 291/// \param __extensions 292/// Optional extensions for the monitoring state, which may vary by 293/// processor. 294/// \param __hints 295/// Optional hints for the monitoring state, which may vary by processor. 296static __inline__ void __DEFAULT_FN_ATTRS 297_mm_mwait(unsigned __extensions, unsigned __hints) 298{ 299 __builtin_ia32_mwait(__extensions, __hints); 300} 301 302#undef __DEFAULT_FN_ATTRS 303 304#endif /* __PMMINTRIN_H */ 305