1// REQUIRES: nvptx-registered-target
2// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
3// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
4
5#define __device__ __attribute__((device))
6#define __global__ __attribute__((global))
7#define __shared__ __attribute__((shared))
8#define __constant__ __attribute__((constant))
9
10__device__ int read_tid() {
11
12// CHECK: call i32 @llvm.ptx.read.tid.x()
13// CHECK: call i32 @llvm.ptx.read.tid.y()
14// CHECK: call i32 @llvm.ptx.read.tid.z()
15// CHECK: call i32 @llvm.ptx.read.tid.w()
16
17  int x = __builtin_ptx_read_tid_x();
18  int y = __builtin_ptx_read_tid_y();
19  int z = __builtin_ptx_read_tid_z();
20  int w = __builtin_ptx_read_tid_w();
21
22  return x + y + z + w;
23
24}
25
26__device__ int read_ntid() {
27
28// CHECK: call i32 @llvm.ptx.read.ntid.x()
29// CHECK: call i32 @llvm.ptx.read.ntid.y()
30// CHECK: call i32 @llvm.ptx.read.ntid.z()
31// CHECK: call i32 @llvm.ptx.read.ntid.w()
32
33  int x = __builtin_ptx_read_ntid_x();
34  int y = __builtin_ptx_read_ntid_y();
35  int z = __builtin_ptx_read_ntid_z();
36  int w = __builtin_ptx_read_ntid_w();
37
38  return x + y + z + w;
39
40}
41
42__device__ int read_ctaid() {
43
44// CHECK: call i32 @llvm.ptx.read.ctaid.x()
45// CHECK: call i32 @llvm.ptx.read.ctaid.y()
46// CHECK: call i32 @llvm.ptx.read.ctaid.z()
47// CHECK: call i32 @llvm.ptx.read.ctaid.w()
48
49  int x = __builtin_ptx_read_ctaid_x();
50  int y = __builtin_ptx_read_ctaid_y();
51  int z = __builtin_ptx_read_ctaid_z();
52  int w = __builtin_ptx_read_ctaid_w();
53
54  return x + y + z + w;
55
56}
57
58__device__ int read_nctaid() {
59
60// CHECK: call i32 @llvm.ptx.read.nctaid.x()
61// CHECK: call i32 @llvm.ptx.read.nctaid.y()
62// CHECK: call i32 @llvm.ptx.read.nctaid.z()
63// CHECK: call i32 @llvm.ptx.read.nctaid.w()
64
65  int x = __builtin_ptx_read_nctaid_x();
66  int y = __builtin_ptx_read_nctaid_y();
67  int z = __builtin_ptx_read_nctaid_z();
68  int w = __builtin_ptx_read_nctaid_w();
69
70  return x + y + z + w;
71
72}
73
74__device__ int read_ids() {
75
76// CHECK: call i32 @llvm.ptx.read.laneid()
77// CHECK: call i32 @llvm.ptx.read.warpid()
78// CHECK: call i32 @llvm.ptx.read.nwarpid()
79// CHECK: call i32 @llvm.ptx.read.smid()
80// CHECK: call i32 @llvm.ptx.read.nsmid()
81// CHECK: call i32 @llvm.ptx.read.gridid()
82
83  int a = __builtin_ptx_read_laneid();
84  int b = __builtin_ptx_read_warpid();
85  int c = __builtin_ptx_read_nwarpid();
86  int d = __builtin_ptx_read_smid();
87  int e = __builtin_ptx_read_nsmid();
88  int f = __builtin_ptx_read_gridid();
89
90  return a + b + c + d + e + f;
91
92}
93
94__device__ int read_lanemasks() {
95
96// CHECK: call i32 @llvm.ptx.read.lanemask.eq()
97// CHECK: call i32 @llvm.ptx.read.lanemask.le()
98// CHECK: call i32 @llvm.ptx.read.lanemask.lt()
99// CHECK: call i32 @llvm.ptx.read.lanemask.ge()
100// CHECK: call i32 @llvm.ptx.read.lanemask.gt()
101
102  int a = __builtin_ptx_read_lanemask_eq();
103  int b = __builtin_ptx_read_lanemask_le();
104  int c = __builtin_ptx_read_lanemask_lt();
105  int d = __builtin_ptx_read_lanemask_ge();
106  int e = __builtin_ptx_read_lanemask_gt();
107
108  return a + b + c + d + e;
109
110}
111
112__device__ long long read_clocks() {
113
114// CHECK: call i32 @llvm.ptx.read.clock()
115// CHECK: call i64 @llvm.ptx.read.clock64()
116
117  int a = __builtin_ptx_read_clock();
118  long long b = __builtin_ptx_read_clock64();
119
120  return a + b;
121}
122
123__device__ int read_pms() {
124
125// CHECK: call i32 @llvm.ptx.read.pm0()
126// CHECK: call i32 @llvm.ptx.read.pm1()
127// CHECK: call i32 @llvm.ptx.read.pm2()
128// CHECK: call i32 @llvm.ptx.read.pm3()
129
130  int a = __builtin_ptx_read_pm0();
131  int b = __builtin_ptx_read_pm1();
132  int c = __builtin_ptx_read_pm2();
133  int d = __builtin_ptx_read_pm3();
134
135  return a + b + c + d;
136
137}
138
139__device__ void sync() {
140
141// CHECK: call void @llvm.ptx.bar.sync(i32 0)
142
143  __builtin_ptx_bar_sync(0);
144
145}
146
147
148// NVVM intrinsics
149
150// The idea is not to test all intrinsics, just that Clang is recognizing the
151// builtins defined in BuiltinsNVPTX.def
152__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
153// CHECK: call float @llvm.nvvm.fmax.f
154  float t1 = __nvvm_fmax_f(f1, f2);
155// CHECK: call float @llvm.nvvm.fmin.f
156  float t2 = __nvvm_fmin_f(f1, f2);
157// CHECK: call float @llvm.nvvm.sqrt.rn.f
158  float t3 = __nvvm_sqrt_rn_f(f1);
159// CHECK: call float @llvm.nvvm.rcp.rn.f
160  float t4 = __nvvm_rcp_rn_f(f2);
161// CHECK: call float @llvm.nvvm.add.rn.f
162  float t5 = __nvvm_add_rn_f(f1, f2);
163
164// CHECK: call double @llvm.nvvm.fmax.d
165  double td1 = __nvvm_fmax_d(d1, d2);
166// CHECK: call double @llvm.nvvm.fmin.d
167  double td2 = __nvvm_fmin_d(d1, d2);
168// CHECK: call double @llvm.nvvm.sqrt.rn.d
169  double td3 = __nvvm_sqrt_rn_d(d1);
170// CHECK: call double @llvm.nvvm.rcp.rn.d
171  double td4 = __nvvm_rcp_rn_d(d2);
172
173// CHECK: call void @llvm.nvvm.membar.cta()
174  __nvvm_membar_cta();
175// CHECK: call void @llvm.nvvm.membar.gl()
176  __nvvm_membar_gl();
177// CHECK: call void @llvm.nvvm.membar.sys()
178  __nvvm_membar_sys();
179// CHECK: call void @llvm.nvvm.barrier0()
180  __nvvm_bar0();
181}
182
183__device__ int di;
184__shared__ int si;
185__device__ long dl;
186__shared__ long sl;
187__device__ long long dll;
188__shared__ long long sll;
189
190// Check for atomic intrinsics
191// CHECK-LABEL: nvvm_atom
192__device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l,
193                          long long *llp, long long ll) {
194  // CHECK: atomicrmw add
195  __nvvm_atom_add_gen_i(ip, i);
196  // CHECK: atomicrmw add
197  __nvvm_atom_add_gen_l(&dl, l);
198  // CHECK: atomicrmw add
199  __nvvm_atom_add_gen_ll(&sll, ll);
200
201  // CHECK: atomicrmw sub
202  __nvvm_atom_sub_gen_i(ip, i);
203  // CHECK: atomicrmw sub
204  __nvvm_atom_sub_gen_l(&dl, l);
205  // CHECK: atomicrmw sub
206  __nvvm_atom_sub_gen_ll(&sll, ll);
207
208  // CHECK: atomicrmw and
209  __nvvm_atom_and_gen_i(ip, i);
210  // CHECK: atomicrmw and
211  __nvvm_atom_and_gen_l(&dl, l);
212  // CHECK: atomicrmw and
213  __nvvm_atom_and_gen_ll(&sll, ll);
214
215  // CHECK: atomicrmw or
216  __nvvm_atom_or_gen_i(ip, i);
217  // CHECK: atomicrmw or
218  __nvvm_atom_or_gen_l(&dl, l);
219  // CHECK: atomicrmw or
220  __nvvm_atom_or_gen_ll(&sll, ll);
221
222  // CHECK: atomicrmw xor
223  __nvvm_atom_xor_gen_i(ip, i);
224  // CHECK: atomicrmw xor
225  __nvvm_atom_xor_gen_l(&dl, l);
226  // CHECK: atomicrmw xor
227  __nvvm_atom_xor_gen_ll(&sll, ll);
228
229  // CHECK: atomicrmw xchg
230  __nvvm_atom_xchg_gen_i(ip, i);
231  // CHECK: atomicrmw xchg
232  __nvvm_atom_xchg_gen_l(&dl, l);
233  // CHECK: atomicrmw xchg
234  __nvvm_atom_xchg_gen_ll(&sll, ll);
235
236  // CHECK: atomicrmw max i32*
237  __nvvm_atom_max_gen_i(ip, i);
238  // CHECK: atomicrmw umax i32*
239  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
240  // CHECK: atomicrmw max
241  __nvvm_atom_max_gen_l(&dl, l);
242  // CHECK: atomicrmw umax
243  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
244  // CHECK: atomicrmw max i64*
245  __nvvm_atom_max_gen_ll(&sll, ll);
246  // CHECK: atomicrmw umax i64*
247  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);
248
249  // CHECK: atomicrmw min i32*
250  __nvvm_atom_min_gen_i(ip, i);
251  // CHECK: atomicrmw umin i32*
252  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
253  // CHECK: atomicrmw min
254  __nvvm_atom_min_gen_l(&dl, l);
255  // CHECK: atomicrmw umin
256  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
257  // CHECK: atomicrmw min i64*
258  __nvvm_atom_min_gen_ll(&sll, ll);
259  // CHECK: atomicrmw umin i64*
260  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);
261
262  // CHECK: cmpxchg
263  // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
264  __nvvm_atom_cas_gen_i(ip, 0, i);
265  // CHECK: cmpxchg
266  // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
267  __nvvm_atom_cas_gen_l(&dl, 0, l);
268  // CHECK: cmpxchg
269  // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
270  __nvvm_atom_cas_gen_ll(&sll, 0, ll);
271
272  // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
273  __nvvm_atom_add_gen_f(fp, f);
274
275  // CHECK: ret
276}
277