/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
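
// Example (illustrative sketch, not part of this header): __ldg loads through
// the read-only data cache and is intended for data that is guaranteed not to
// be written for the duration of the kernel. A hypothetical kernel using it:
//
//   __global__ void scale(float *out, const float *in, float k, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n)
//       out[i] = k * __ldg(&in[i]); // cached, read-only load of in[i]
//   }
//
// The vector overloads below are used the same way, e.g.
// float4 v = __ldg(&vecs[i]).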

inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment. This is only
  // safe if alignof(c2) <= alignof(char2).
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
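
// Example (illustrative sketch, not part of this header): a funnel shift
// concatenates high32:low32 into a 64-bit value, shifts it, and returns the
// upper 32 bits (left shifts) or the lower 32 bits (right shifts). The wrap
// variants reduce shiftWidth modulo 32; the clamp variants clamp it to at
// most 32. A common use is a 32-bit rotate; rotl32 below is a hypothetical
// helper, not part of the CUDA API:
//
//   __device__ unsigned rotl32(unsigned x, unsigned n) {
//     // Shifting the 64-bit value x:x left by n and keeping the upper
//     // 32 bits rotates x left by n (mod 32).
//     return __funnelshift_l(x, x, n);
//   }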

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)