/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

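// Usage sketch (illustrative only, not part of this header): __ldg loads
// through the read-only data cache, so the pointed-to memory must not be
// written for the lifetime of the kernel, e.g.
//   __device__ float sum2(const float *p) { return __ldg(p) + __ldg(p + 1); }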
inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }

inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment.  This is only
  // safe if alignof(c2) <= alignof(char2).
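  // (Concretely, both are expected to be 2-byte aligned here, so the
  // reinterpret_cast below does not over-align the pointer.)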
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

// TODO: Implement these as intrinsics, so the backend can work its magic on
// these.  Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
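// For reference, a plain-C sketch of what the .wrap variants compute
// (illustrative only; the inline asm below is the actual implementation):
//   uint64_t c = ((uint64_t)high32 << 32) | low32;
//   __funnelshift_l(low32, high32, n) == (unsigned)((c << (n & 31)) >> 32)
//   __funnelshift_r(low32, high32, n) == (unsigned)(c >> (n & 31))
// The .clamp variants (__funnelshift_lc/_rc) clamp the shift amount to
// min(n, 32) instead of masking it with 31.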
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)