1//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This provides a class for OpenMP runtime code generation specialized to NVPTX 11// targets. 12// 13//===----------------------------------------------------------------------===// 14 15#include "CGOpenMPRuntimeNVPTX.h" 16#include "clang/AST/DeclOpenMP.h" 17#include "CodeGenFunction.h" 18#include "clang/AST/StmtOpenMP.h" 19 20using namespace clang; 21using namespace CodeGen; 22 23/// \brief Get the GPU warp size. 24llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXWarpSize(CodeGenFunction &CGF) { 25 CGBuilderTy &Bld = CGF.Builder; 26 return Bld.CreateCall( 27 llvm::Intrinsic::getDeclaration( 28 &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize), 29 llvm::None, "nvptx_warp_size"); 30} 31 32/// \brief Get the id of the current thread on the GPU. 33llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXThreadID(CodeGenFunction &CGF) { 34 CGBuilderTy &Bld = CGF.Builder; 35 return Bld.CreateCall( 36 llvm::Intrinsic::getDeclaration( 37 &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x), 38 llvm::None, "nvptx_tid"); 39} 40 41// \brief Get the maximum number of threads in a block of the GPU. 42llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXNumThreads(CodeGenFunction &CGF) { 43 CGBuilderTy &Bld = CGF.Builder; 44 return Bld.CreateCall( 45 llvm::Intrinsic::getDeclaration( 46 &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x), 47 llvm::None, "nvptx_num_threads"); 48} 49 50/// \brief Get barrier to synchronize all threads in a block. 51void CGOpenMPRuntimeNVPTX::getNVPTXCTABarrier(CodeGenFunction &CGF) { 52 CGBuilderTy &Bld = CGF.Builder; 53 Bld.CreateCall(llvm::Intrinsic::getDeclaration( 54 &CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); 55} 56 57// \brief Synchronize all GPU threads in a block. 58void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) { 59 getNVPTXCTABarrier(CGF); 60} 61 62/// \brief Get the thread id of the OMP master thread. 63/// The master thread id is the first thread (lane) of the last warp in the 64/// GPU block. Warp size is assumed to be some power of 2. 65/// Thread id is 0 indexed. 66/// E.g: If NumThreads is 33, master id is 32. 67/// If NumThreads is 64, master id is 32. 68/// If NumThreads is 1024, master id is 992. 69llvm::Value *CGOpenMPRuntimeNVPTX::getMasterThreadID(CodeGenFunction &CGF) { 70 CGBuilderTy &Bld = CGF.Builder; 71 llvm::Value *NumThreads = getNVPTXNumThreads(CGF); 72 73 // We assume that the warp size is a power of 2. 74 llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1)); 75 76 return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)), 77 Bld.CreateNot(Mask), "master_tid"); 78} 79 80namespace { 81enum OpenMPRTLFunctionNVPTX { 82 /// \brief Call to void __kmpc_kernel_init(kmp_int32 omp_handle, 83 /// kmp_int32 thread_limit); 84 OMPRTL_NVPTX__kmpc_kernel_init, 85}; 86 87// NVPTX Address space 88enum ADDRESS_SPACE { 89 ADDRESS_SPACE_SHARED = 3, 90}; 91} // namespace 92 93CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState( 94 CodeGenModule &CGM) 95 : WorkerFn(nullptr), CGFI(nullptr) { 96 createWorkerFunction(CGM); 97} 98 99void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( 100 CodeGenModule &CGM) { 101 // Create an worker function with no arguments. 102 CGFI = &CGM.getTypes().arrangeNullaryFunction(); 103 104 WorkerFn = llvm::Function::Create( 105 CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage, 106 /* placeholder */ "_worker", &CGM.getModule()); 107 CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI); 108 WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage); 109 WorkerFn->addFnAttr(llvm::Attribute::NoInline); 110} 111 112void CGOpenMPRuntimeNVPTX::initializeEnvironment() { 113 // 114 // Initialize master-worker control state in shared memory. 115 // 116 117 auto DL = CGM.getDataLayout(); 118 ActiveWorkers = new llvm::GlobalVariable( 119 CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false, 120 llvm::GlobalValue::CommonLinkage, 121 llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0, 122 llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED); 123 ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty)); 124 125 WorkID = new llvm::GlobalVariable( 126 CGM.getModule(), CGM.Int64Ty, /*isConstant=*/false, 127 llvm::GlobalValue::CommonLinkage, 128 llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0, 129 llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED); 130 WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty)); 131} 132 133void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { 134 auto &Ctx = CGM.getContext(); 135 136 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); 137 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {}); 138 emitWorkerLoop(CGF, WST); 139 CGF.FinishFunction(); 140} 141 142void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, 143 WorkerFunctionState &WST) { 144 // 145 // The workers enter this loop and wait for parallel work from the master. 146 // When the master encounters a parallel region it sets up the work + variable 147 // arguments, and wakes up the workers. The workers first check to see if 148 // they are required for the parallel region, i.e., within the # of requested 149 // parallel threads. The activated workers load the variable arguments and 150 // execute the parallel work. 151 // 152 153 CGBuilderTy &Bld = CGF.Builder; 154 155 llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); 156 llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); 157 llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); 158 llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); 159 llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); 160 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); 161 162 CGF.EmitBranch(AwaitBB); 163 164 // Workers wait for work from master. 165 CGF.EmitBlock(AwaitBB); 166 // Wait for parallel work 167 syncCTAThreads(CGF); 168 // On termination condition (workid == 0), exit loop. 169 llvm::Value *ShouldTerminate = Bld.CreateICmpEQ( 170 Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()), 171 llvm::Constant::getNullValue(WorkID->getType()->getElementType()), 172 "should_terminate"); 173 Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); 174 175 // Activate requested workers. 176 CGF.EmitBlock(SelectWorkersBB); 177 llvm::Value *ThreadID = getNVPTXThreadID(CGF); 178 llvm::Value *ActiveThread = Bld.CreateICmpSLT( 179 ThreadID, 180 Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()), 181 "active_thread"); 182 Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB); 183 184 // Signal start of parallel region. 185 CGF.EmitBlock(ExecuteBB); 186 // TODO: Add parallel work. 187 188 // Signal end of parallel region. 189 CGF.EmitBlock(TerminateBB); 190 CGF.EmitBranch(BarrierBB); 191 192 // All active and inactive workers wait at a barrier after parallel region. 193 CGF.EmitBlock(BarrierBB); 194 // Barrier after parallel region. 195 syncCTAThreads(CGF); 196 CGF.EmitBranch(AwaitBB); 197 198 // Exit target region. 199 CGF.EmitBlock(ExitBB); 200} 201 202// Setup NVPTX threads for master-worker OpenMP scheme. 203void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF, 204 EntryFunctionState &EST, 205 WorkerFunctionState &WST) { 206 CGBuilderTy &Bld = CGF.Builder; 207 208 // Get the master thread id. 209 llvm::Value *MasterID = getMasterThreadID(CGF); 210 // Current thread's identifier. 211 llvm::Value *ThreadID = getNVPTXThreadID(CGF); 212 213 // Setup BBs in entry function. 214 llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker"); 215 llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); 216 llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); 217 EST.ExitBB = CGF.createBasicBlock(".exit"); 218 219 // The head (master thread) marches on while its body of companion threads in 220 // the warp go to sleep. 221 llvm::Value *ShouldDie = 222 Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp"); 223 Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB); 224 225 // Select worker threads... 226 CGF.EmitBlock(WorkerCheckBB); 227 llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker"); 228 Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB); 229 230 // ... and send to worker loop, awaiting parallel invocation. 231 CGF.EmitBlock(WorkerBB); 232 CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); 233 CGF.EmitBranch(EST.ExitBB); 234 235 // Only master thread executes subsequent serial code. 236 CGF.EmitBlock(MasterBB); 237 238 // First action in sequential region: 239 // Initialize the state of the OpenMP runtime library on the GPU. 240 llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)}; 241 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), 242 Args); 243} 244 245void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF, 246 EntryFunctionState &EST) { 247 CGBuilderTy &Bld = CGF.Builder; 248 llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); 249 CGF.EmitBranch(TerminateBB); 250 251 CGF.EmitBlock(TerminateBB); 252 // Signal termination condition. 253 Bld.CreateAlignedStore( 254 llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID, 255 WorkID->getAlignment()); 256 // Barrier to terminate worker threads. 257 syncCTAThreads(CGF); 258 // Master thread jumps to exit point. 259 CGF.EmitBranch(EST.ExitBB); 260 261 CGF.EmitBlock(EST.ExitBB); 262} 263 264/// \brief Returns specified OpenMP runtime function for the current OpenMP 265/// implementation. Specialized for the NVPTX device. 266/// \param Function OpenMP runtime function. 267/// \return Specified function. 268llvm::Constant * 269CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { 270 llvm::Constant *RTLFn = nullptr; 271 switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) { 272 case OMPRTL_NVPTX__kmpc_kernel_init: { 273 // Build void __kmpc_kernel_init(kmp_int32 omp_handle, 274 // kmp_int32 thread_limit); 275 llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty}; 276 llvm::FunctionType *FnTy = 277 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); 278 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); 279 break; 280 } 281 } 282 return RTLFn; 283} 284 285void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID, 286 llvm::Constant *Addr, 287 uint64_t Size) { 288 auto *F = dyn_cast<llvm::Function>(Addr); 289 // TODO: Add support for global variables on the device after declare target 290 // support. 291 if (!F) 292 return; 293 llvm::Module *M = F->getParent(); 294 llvm::LLVMContext &Ctx = M->getContext(); 295 296 // Get "nvvm.annotations" metadata node 297 llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations"); 298 299 llvm::Metadata *MDVals[] = { 300 llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"), 301 llvm::ConstantAsMetadata::get( 302 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; 303 // Append metadata to nvvm.annotations 304 MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); 305} 306 307void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( 308 const OMPExecutableDirective &D, StringRef ParentName, 309 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, 310 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { 311 if (!IsOffloadEntry) // Nothing to do. 312 return; 313 314 assert(!ParentName.empty() && "Invalid target region parent name!"); 315 316 EntryFunctionState EST; 317 WorkerFunctionState WST(CGM); 318 319 // Emit target region as a standalone region. 320 class NVPTXPrePostActionTy : public PrePostActionTy { 321 CGOpenMPRuntimeNVPTX &RT; 322 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; 323 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; 324 325 public: 326 NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, 327 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, 328 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) 329 : RT(RT), EST(EST), WST(WST) {} 330 void Enter(CodeGenFunction &CGF) override { 331 RT.emitEntryHeader(CGF, EST, WST); 332 } 333 void Exit(CodeGenFunction &CGF) override { RT.emitEntryFooter(CGF, EST); } 334 } Action(*this, EST, WST); 335 CodeGen.setAction(Action); 336 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, 337 IsOffloadEntry, CodeGen); 338 339 // Create the worker function 340 emitWorkerFunction(WST); 341 342 // Now change the name of the worker function to correspond to this target 343 // region's entry function. 344 WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); 345} 346 347CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) 348 : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) { 349 if (!CGM.getLangOpts().OpenMPIsDevice) 350 llvm_unreachable("OpenMP NVPTX can only handle device code."); 351 352 // Called once per module during initialization. 353 initializeEnvironment(); 354} 355 356void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, 357 const Expr *NumTeams, 358 const Expr *ThreadLimit, 359 SourceLocation Loc) {} 360 361llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction( 362 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, 363 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { 364 365 llvm::Function *OutlinedFun = nullptr; 366 if (isa<OMPTeamsDirective>(D)) { 367 llvm::Value *OutlinedFunVal = 368 CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction( 369 D, ThreadIDVar, InnermostKind, CodeGen); 370 OutlinedFun = cast<llvm::Function>(OutlinedFunVal); 371 OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); 372 } else 373 llvm_unreachable("parallel directive is not yet supported for nvptx " 374 "backend."); 375 376 return OutlinedFun; 377} 378 379void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, 380 const OMPExecutableDirective &D, 381 SourceLocation Loc, 382 llvm::Value *OutlinedFn, 383 ArrayRef<llvm::Value *> CapturedVars) { 384 if (!CGF.HaveInsertPoint()) 385 return; 386 387 Address ZeroAddr = 388 CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), 389 /*Name*/ ".zero.addr"); 390 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); 391 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; 392 OutlinedFnArgs.push_back(ZeroAddr.getPointer()); 393 OutlinedFnArgs.push_back(ZeroAddr.getPointer()); 394 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); 395 CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); 396} 397