1//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This provides a class for CUDA code generation targeting the NVIDIA CUDA 11// runtime library. 12// 13//===----------------------------------------------------------------------===// 14 15#include "CGCUDARuntime.h" 16#include "CodeGenFunction.h" 17#include "CodeGenModule.h" 18#include "clang/AST/Decl.h" 19#include "llvm/IR/BasicBlock.h" 20#include "llvm/IR/CallSite.h" 21#include "llvm/IR/Constants.h" 22#include "llvm/IR/DerivedTypes.h" 23 24using namespace clang; 25using namespace CodeGen; 26 27namespace { 28 29class CGNVCUDARuntime : public CGCUDARuntime { 30 31private: 32 llvm::Type *IntTy, *SizeTy, *VoidTy; 33 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 34 35 /// Convenience reference to LLVM Context 36 llvm::LLVMContext &Context; 37 /// Convenience reference to the current module 38 llvm::Module &TheModule; 39 /// Keeps track of kernel launch stubs emitted in this module 40 llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 41 llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; 42 /// Keeps track of variables containing handles of GPU binaries. Populated by 43 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 44 /// ModuleDtorFunction() 45 llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles; 46 47 llvm::Constant *getSetupArgumentFn() const; 48 llvm::Constant *getLaunchFn() const; 49 50 /// Creates a function to register all kernel stubs generated in this module. 51 llvm::Function *makeRegisterGlobalsFn(); 52 53 /// Helper function that generates a constant string and returns a pointer to 54 /// the start of the string. The result of this function can be used anywhere 55 /// where the C code specifies const char*. 56 llvm::Constant *makeConstantString(const std::string &Str, 57 const std::string &Name = "", 58 unsigned Alignment = 0) { 59 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 60 llvm::ConstantInt::get(SizeTy, 0)}; 61 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 62 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 63 ConstStr.getPointer(), Zeros); 64 } 65 66 void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 67 68public: 69 CGNVCUDARuntime(CodeGenModule &CGM); 70 71 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 72 void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { 73 DeviceVars.push_back(std::make_pair(&Var, Flags)); 74 } 75 76 /// Creates module constructor function 77 llvm::Function *makeModuleCtorFunction() override; 78 /// Creates module destructor function 79 llvm::Function *makeModuleDtorFunction() override; 80}; 81 82} 83 84CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 85 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 86 TheModule(CGM.getModule()) { 87 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 88 ASTContext &Ctx = CGM.getContext(); 89 90 IntTy = Types.ConvertType(Ctx.IntTy); 91 SizeTy = Types.ConvertType(Ctx.getSizeType()); 92 VoidTy = llvm::Type::getVoidTy(Context); 93 94 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 95 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 96 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 97} 98 99llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 100 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 101 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 102 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, 103 Params, false), 104 "cudaSetupArgument"); 105} 106 107llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 108 // cudaError_t cudaLaunch(char *) 109 return CGM.CreateRuntimeFunction( 110 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 111} 112 113void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 114 FunctionArgList &Args) { 115 EmittedKernels.push_back(CGF.CurFn); 116 emitDeviceStubBody(CGF, Args); 117} 118 119void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 120 FunctionArgList &Args) { 121 // Build the argument value list and the argument stack struct type. 122 SmallVector<llvm::Value *, 16> ArgValues; 123 std::vector<llvm::Type *> ArgTypes; 124 for (FunctionArgList::const_iterator I = Args.begin(), E = Args.end(); 125 I != E; ++I) { 126 llvm::Value *V = CGF.GetAddrOfLocalVar(*I).getPointer(); 127 ArgValues.push_back(V); 128 assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType"); 129 ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType()); 130 } 131 llvm::StructType *ArgStackTy = llvm::StructType::get(Context, ArgTypes); 132 133 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 134 135 // Emit the calls to cudaSetupArgument 136 llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 137 for (unsigned I = 0, E = Args.size(); I != E; ++I) { 138 llvm::Value *Args[3]; 139 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 140 Args[0] = CGF.Builder.CreatePointerCast(ArgValues[I], VoidPtrTy); 141 Args[1] = CGF.Builder.CreateIntCast( 142 llvm::ConstantExpr::getSizeOf(ArgTypes[I]), 143 SizeTy, false); 144 Args[2] = CGF.Builder.CreateIntCast( 145 llvm::ConstantExpr::getOffsetOf(ArgStackTy, I), 146 SizeTy, false); 147 llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 148 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 149 llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 150 CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 151 CGF.EmitBlock(NextBlock); 152 } 153 154 // Emit the call to cudaLaunch 155 llvm::Constant *cudaLaunchFn = getLaunchFn(); 156 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 157 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 158 CGF.EmitBranch(EndBlock); 159 160 CGF.EmitBlock(EndBlock); 161} 162 163/// Creates a function that sets up state on the host side for CUDA objects that 164/// have a presence on both the host and device sides. Specifically, registers 165/// the host side of kernel functions and device global variables with the CUDA 166/// runtime. 167/// \code 168/// void __cuda_register_globals(void** GpuBinaryHandle) { 169/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 170/// ... 171/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 172/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 173/// ... 174/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 175/// } 176/// \endcode 177llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 178 // No need to register anything 179 if (EmittedKernels.empty() && DeviceVars.empty()) 180 return nullptr; 181 182 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 183 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 184 llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule); 185 llvm::BasicBlock *EntryBB = 186 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 187 CGBuilderTy Builder(CGM, Context); 188 Builder.SetInsertPoint(EntryBB); 189 190 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 191 // int, uint3*, uint3*, dim3*, dim3*, int*) 192 llvm::Type *RegisterFuncParams[] = { 193 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 194 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 195 llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 196 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 197 "__cudaRegisterFunction"); 198 199 // Extract GpuBinaryHandle passed as the first argument passed to 200 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 201 // each emitted kernel. 202 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 203 for (llvm::Function *Kernel : EmittedKernels) { 204 llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 205 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 206 llvm::Value *Args[] = { 207 &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 208 KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 209 NullPtr, NullPtr, NullPtr, 210 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 211 Builder.CreateCall(RegisterFunc, Args); 212 } 213 214 // void __cudaRegisterVar(void **, char *, char *, const char *, 215 // int, int, int, int) 216 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 217 CharPtrTy, IntTy, IntTy, 218 IntTy, IntTy}; 219 llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( 220 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 221 "__cudaRegisterVar"); 222 for (auto &Pair : DeviceVars) { 223 llvm::GlobalVariable *Var = Pair.first; 224 unsigned Flags = Pair.second; 225 llvm::Constant *VarName = makeConstantString(Var->getName()); 226 uint64_t VarSize = 227 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 228 llvm::Value *Args[] = { 229 &GpuBinaryHandlePtr, 230 Builder.CreateBitCast(Var, VoidPtrTy), 231 VarName, 232 VarName, 233 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 234 llvm::ConstantInt::get(IntTy, VarSize), 235 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 236 llvm::ConstantInt::get(IntTy, 0)}; 237 Builder.CreateCall(RegisterVar, Args); 238 } 239 240 Builder.CreateRetVoid(); 241 return RegisterKernelsFunc; 242} 243 244/// Creates a global constructor function for the module: 245/// \code 246/// void __cuda_module_ctor(void*) { 247/// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0); 248/// __cuda_register_globals(Handle0); 249/// ... 250/// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN); 251/// __cuda_register_globals(HandleN); 252/// } 253/// \endcode 254llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 255 // No need to generate ctors/dtors if there are no GPU binaries. 256 if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty()) 257 return nullptr; 258 259 // void __cuda_register_globals(void* handle); 260 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 261 // void ** __cudaRegisterFatBinary(void *); 262 llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 263 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 264 "__cudaRegisterFatBinary"); 265 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 266 llvm::StructType *FatbinWrapperTy = 267 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr); 268 269 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 270 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 271 llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule); 272 llvm::BasicBlock *CtorEntryBB = 273 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 274 CGBuilderTy CtorBuilder(CGM, Context); 275 276 CtorBuilder.SetInsertPoint(CtorEntryBB); 277 278 // For each GPU binary, register it with the CUDA runtime and store returned 279 // handle in a global variable and save the handle in GpuBinaryHandles vector 280 // to be cleaned up in destructor on exit. Then associate all known kernels 281 // with the GPU binary handle so CUDA runtime can figure out what to call on 282 // the GPU side. 283 for (const std::string &GpuBinaryFileName : 284 CGM.getCodeGenOpts().CudaGpuBinaryFileNames) { 285 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr = 286 llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName); 287 if (std::error_code EC = GpuBinaryOrErr.getError()) { 288 CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName 289 << EC.message(); 290 continue; 291 } 292 293 // Create initialized wrapper structure that points to the loaded GPU binary 294 llvm::Constant *Values[] = { 295 llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic. 296 llvm::ConstantInt::get(IntTy, 1), // Fatbin version. 297 makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 16), // Data. 298 llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1. 299 llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable( 300 TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage, 301 llvm::ConstantStruct::get(FatbinWrapperTy, Values), 302 "__cuda_fatbin_wrapper"); 303 // NVIDIA's cuobjdump looks for fatbins in this section. 304 FatbinWrapper->setSection(".nvFatBinSegment"); 305 306 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 307 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 308 RegisterFatbinFunc, 309 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 310 llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable( 311 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 312 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 313 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 314 CGM.getPointerAlign()); 315 316 // Call __cuda_register_globals(GpuBinaryHandle); 317 if (RegisterGlobalsFunc) 318 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 319 320 // Save GpuBinaryHandle so we can unregister it in destructor. 321 GpuBinaryHandles.push_back(GpuBinaryHandle); 322 } 323 324 CtorBuilder.CreateRetVoid(); 325 return ModuleCtorFunc; 326} 327 328/// Creates a global destructor function that unregisters all GPU code blobs 329/// registered by constructor. 330/// \code 331/// void __cuda_module_dtor(void*) { 332/// __cudaUnregisterFatBinary(Handle0); 333/// ... 334/// __cudaUnregisterFatBinary(HandleN); 335/// } 336/// \endcode 337llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 338 // No need for destructor if we don't have handles to unregister. 339 if (GpuBinaryHandles.empty()) 340 return nullptr; 341 342 // void __cudaUnregisterFatBinary(void ** handle); 343 llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 344 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 345 "__cudaUnregisterFatBinary"); 346 347 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 348 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 349 llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule); 350 llvm::BasicBlock *DtorEntryBB = 351 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 352 CGBuilderTy DtorBuilder(CGM, Context); 353 DtorBuilder.SetInsertPoint(DtorEntryBB); 354 355 for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) { 356 auto HandleValue = 357 DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); 358 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 359 } 360 361 DtorBuilder.CreateRetVoid(); 362 return ModuleDtorFunc; 363} 364 365CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 366 return new CGNVCUDARuntime(CGM); 367} 368