1// REQUIRES: nvptx-registered-target 2// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s 3// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s 4 5#define __device__ __attribute__((device)) 6#define __global__ __attribute__((global)) 7#define __shared__ __attribute__((shared)) 8#define __constant__ __attribute__((constant)) 9 10__device__ int read_tid() { 11 12// CHECK: call i32 @llvm.ptx.read.tid.x() 13// CHECK: call i32 @llvm.ptx.read.tid.y() 14// CHECK: call i32 @llvm.ptx.read.tid.z() 15// CHECK: call i32 @llvm.ptx.read.tid.w() 16 17 int x = __builtin_ptx_read_tid_x(); 18 int y = __builtin_ptx_read_tid_y(); 19 int z = __builtin_ptx_read_tid_z(); 20 int w = __builtin_ptx_read_tid_w(); 21 22 return x + y + z + w; 23 24} 25 26__device__ int read_ntid() { 27 28// CHECK: call i32 @llvm.ptx.read.ntid.x() 29// CHECK: call i32 @llvm.ptx.read.ntid.y() 30// CHECK: call i32 @llvm.ptx.read.ntid.z() 31// CHECK: call i32 @llvm.ptx.read.ntid.w() 32 33 int x = __builtin_ptx_read_ntid_x(); 34 int y = __builtin_ptx_read_ntid_y(); 35 int z = __builtin_ptx_read_ntid_z(); 36 int w = __builtin_ptx_read_ntid_w(); 37 38 return x + y + z + w; 39 40} 41 42__device__ int read_ctaid() { 43 44// CHECK: call i32 @llvm.ptx.read.ctaid.x() 45// CHECK: call i32 @llvm.ptx.read.ctaid.y() 46// CHECK: call i32 @llvm.ptx.read.ctaid.z() 47// CHECK: call i32 @llvm.ptx.read.ctaid.w() 48 49 int x = __builtin_ptx_read_ctaid_x(); 50 int y = __builtin_ptx_read_ctaid_y(); 51 int z = __builtin_ptx_read_ctaid_z(); 52 int w = __builtin_ptx_read_ctaid_w(); 53 54 return x + y + z + w; 55 56} 57 58__device__ int read_nctaid() { 59 60// CHECK: call i32 @llvm.ptx.read.nctaid.x() 61// CHECK: call i32 @llvm.ptx.read.nctaid.y() 62// CHECK: call i32 @llvm.ptx.read.nctaid.z() 63// CHECK: call i32 @llvm.ptx.read.nctaid.w() 64 65 int x = __builtin_ptx_read_nctaid_x(); 66 int y = __builtin_ptx_read_nctaid_y(); 67 int z = __builtin_ptx_read_nctaid_z(); 68 int w = __builtin_ptx_read_nctaid_w(); 69 70 return x + y + z + w; 71 72} 73 74__device__ int read_ids() { 75 76// CHECK: call i32 @llvm.ptx.read.laneid() 77// CHECK: call i32 @llvm.ptx.read.warpid() 78// CHECK: call i32 @llvm.ptx.read.nwarpid() 79// CHECK: call i32 @llvm.ptx.read.smid() 80// CHECK: call i32 @llvm.ptx.read.nsmid() 81// CHECK: call i32 @llvm.ptx.read.gridid() 82 83 int a = __builtin_ptx_read_laneid(); 84 int b = __builtin_ptx_read_warpid(); 85 int c = __builtin_ptx_read_nwarpid(); 86 int d = __builtin_ptx_read_smid(); 87 int e = __builtin_ptx_read_nsmid(); 88 int f = __builtin_ptx_read_gridid(); 89 90 return a + b + c + d + e + f; 91 92} 93 94__device__ int read_lanemasks() { 95 96// CHECK: call i32 @llvm.ptx.read.lanemask.eq() 97// CHECK: call i32 @llvm.ptx.read.lanemask.le() 98// CHECK: call i32 @llvm.ptx.read.lanemask.lt() 99// CHECK: call i32 @llvm.ptx.read.lanemask.ge() 100// CHECK: call i32 @llvm.ptx.read.lanemask.gt() 101 102 int a = __builtin_ptx_read_lanemask_eq(); 103 int b = __builtin_ptx_read_lanemask_le(); 104 int c = __builtin_ptx_read_lanemask_lt(); 105 int d = __builtin_ptx_read_lanemask_ge(); 106 int e = __builtin_ptx_read_lanemask_gt(); 107 108 return a + b + c + d + e; 109 110} 111 112__device__ long long read_clocks() { 113 114// CHECK: call i32 @llvm.ptx.read.clock() 115// CHECK: call i64 @llvm.ptx.read.clock64() 116 117 int a = __builtin_ptx_read_clock(); 118 long long b = __builtin_ptx_read_clock64(); 119 120 return a + b; 121} 122 123__device__ int read_pms() { 124 125// CHECK: call i32 @llvm.ptx.read.pm0() 126// CHECK: call i32 @llvm.ptx.read.pm1() 127// CHECK: call i32 @llvm.ptx.read.pm2() 128// CHECK: call i32 @llvm.ptx.read.pm3() 129 130 int a = __builtin_ptx_read_pm0(); 131 int b = __builtin_ptx_read_pm1(); 132 int c = __builtin_ptx_read_pm2(); 133 int d = __builtin_ptx_read_pm3(); 134 135 return a + b + c + d; 136 137} 138 139__device__ void sync() { 140 141// CHECK: call void @llvm.ptx.bar.sync(i32 0) 142 143 __builtin_ptx_bar_sync(0); 144 145} 146 147 148// NVVM intrinsics 149 150// The idea is not to test all intrinsics, just that Clang is recognizing the 151// builtins defined in BuiltinsNVPTX.def 152__device__ void nvvm_math(float f1, float f2, double d1, double d2) { 153// CHECK: call float @llvm.nvvm.fmax.f 154 float t1 = __nvvm_fmax_f(f1, f2); 155// CHECK: call float @llvm.nvvm.fmin.f 156 float t2 = __nvvm_fmin_f(f1, f2); 157// CHECK: call float @llvm.nvvm.sqrt.rn.f 158 float t3 = __nvvm_sqrt_rn_f(f1); 159// CHECK: call float @llvm.nvvm.rcp.rn.f 160 float t4 = __nvvm_rcp_rn_f(f2); 161// CHECK: call float @llvm.nvvm.add.rn.f 162 float t5 = __nvvm_add_rn_f(f1, f2); 163 164// CHECK: call double @llvm.nvvm.fmax.d 165 double td1 = __nvvm_fmax_d(d1, d2); 166// CHECK: call double @llvm.nvvm.fmin.d 167 double td2 = __nvvm_fmin_d(d1, d2); 168// CHECK: call double @llvm.nvvm.sqrt.rn.d 169 double td3 = __nvvm_sqrt_rn_d(d1); 170// CHECK: call double @llvm.nvvm.rcp.rn.d 171 double td4 = __nvvm_rcp_rn_d(d2); 172 173// CHECK: call void @llvm.nvvm.membar.cta() 174 __nvvm_membar_cta(); 175// CHECK: call void @llvm.nvvm.membar.gl() 176 __nvvm_membar_gl(); 177// CHECK: call void @llvm.nvvm.membar.sys() 178 __nvvm_membar_sys(); 179// CHECK: call void @llvm.nvvm.barrier0() 180 __nvvm_bar0(); 181} 182 183__device__ int di; 184__shared__ int si; 185__device__ long dl; 186__shared__ long sl; 187__device__ long long dll; 188__shared__ long long sll; 189 190// Check for atomic intrinsics 191// CHECK-LABEL: nvvm_atom 192__device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l, 193 long long *llp, long long ll) { 194 // CHECK: atomicrmw add 195 __nvvm_atom_add_gen_i(ip, i); 196 // CHECK: atomicrmw add 197 __nvvm_atom_add_gen_l(&dl, l); 198 // CHECK: atomicrmw add 199 __nvvm_atom_add_gen_ll(&sll, ll); 200 201 // CHECK: atomicrmw sub 202 __nvvm_atom_sub_gen_i(ip, i); 203 // CHECK: atomicrmw sub 204 __nvvm_atom_sub_gen_l(&dl, l); 205 // CHECK: atomicrmw sub 206 __nvvm_atom_sub_gen_ll(&sll, ll); 207 208 // CHECK: atomicrmw and 209 __nvvm_atom_and_gen_i(ip, i); 210 // CHECK: atomicrmw and 211 __nvvm_atom_and_gen_l(&dl, l); 212 // CHECK: atomicrmw and 213 __nvvm_atom_and_gen_ll(&sll, ll); 214 215 // CHECK: atomicrmw or 216 __nvvm_atom_or_gen_i(ip, i); 217 // CHECK: atomicrmw or 218 __nvvm_atom_or_gen_l(&dl, l); 219 // CHECK: atomicrmw or 220 __nvvm_atom_or_gen_ll(&sll, ll); 221 222 // CHECK: atomicrmw xor 223 __nvvm_atom_xor_gen_i(ip, i); 224 // CHECK: atomicrmw xor 225 __nvvm_atom_xor_gen_l(&dl, l); 226 // CHECK: atomicrmw xor 227 __nvvm_atom_xor_gen_ll(&sll, ll); 228 229 // CHECK: atomicrmw xchg 230 __nvvm_atom_xchg_gen_i(ip, i); 231 // CHECK: atomicrmw xchg 232 __nvvm_atom_xchg_gen_l(&dl, l); 233 // CHECK: atomicrmw xchg 234 __nvvm_atom_xchg_gen_ll(&sll, ll); 235 236 // CHECK: atomicrmw max i32* 237 __nvvm_atom_max_gen_i(ip, i); 238 // CHECK: atomicrmw umax i32* 239 __nvvm_atom_max_gen_ui((unsigned int *)ip, i); 240 // CHECK: atomicrmw max 241 __nvvm_atom_max_gen_l(&dl, l); 242 // CHECK: atomicrmw umax 243 __nvvm_atom_max_gen_ul((unsigned long *)&dl, l); 244 // CHECK: atomicrmw max i64* 245 __nvvm_atom_max_gen_ll(&sll, ll); 246 // CHECK: atomicrmw umax i64* 247 __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll); 248 249 // CHECK: atomicrmw min i32* 250 __nvvm_atom_min_gen_i(ip, i); 251 // CHECK: atomicrmw umin i32* 252 __nvvm_atom_min_gen_ui((unsigned int *)ip, i); 253 // CHECK: atomicrmw min 254 __nvvm_atom_min_gen_l(&dl, l); 255 // CHECK: atomicrmw umin 256 __nvvm_atom_min_gen_ul((unsigned long *)&dl, l); 257 // CHECK: atomicrmw min i64* 258 __nvvm_atom_min_gen_ll(&sll, ll); 259 // CHECK: atomicrmw umin i64* 260 __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll); 261 262 // CHECK: cmpxchg 263 // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0 264 __nvvm_atom_cas_gen_i(ip, 0, i); 265 // CHECK: cmpxchg 266 // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0 267 __nvvm_atom_cas_gen_l(&dl, 0, l); 268 // CHECK: cmpxchg 269 // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0 270 __nvvm_atom_cas_gen_ll(&sll, 0, ll); 271 272 // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32 273 __nvvm_atom_add_gen_f(fp, f); 274 275 // CHECK: ret 276} 277