//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions with
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (i.e. which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
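///
/// As a rough illustration (hypothetical instruction mix, not actual MIR),
/// consider a block containing
///
///   %val = IMAGE_SAMPLE ...          ; implicit derivatives, needs WQM
///   BUFFER_STORE_DWORD %val, ...     ; memory side effect, needs Exact
///
/// The pass arranges for the sample to run with the WQM exec mask (so helper
/// pixels contribute to derivatives) and for the store to run with the live
/// mask (so helper pixels do not write memory).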
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateExact = 0x2,
};

struct InstrInfo {
  char Needs = 0;
  char OutNeeds = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() {}
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<const MachineInstr *, 2> ExecExports;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;

  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Whole Quad Mode";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
                "SI Whole Quad Mode", false, false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
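//
// For example (an illustrative summary of the checks below, not an
// exhaustive list): opcodes flagged as WQM or DS seed StateWQM, stores that
// increment VM_CNT seed StateExact, and everything else is left unmarked so
// that it simply runs in whatever mode surrounds it.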
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");

  for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
        Flags = StateWQM;
      } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
        Flags = StateExact;
      } else {
        // Handle export instructions with the exec mask valid flag set.
        if (Opcode == AMDGPU::EXP) {
          if (MI.getOperand(4).getImm() != 0)
            ExecExports.push_back(&MI);
        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      Instructions[&MI].Needs = Flags;
      Worklist.push_back(&MI);
      GlobalFlags |= Flags;
    }

    if (WQMOutputs && MBB.succ_empty()) {
      // This is a prolog shader. Make sure we go back to exact mode at the
      // end.
      Blocks[&MBB].OutNeeds = StateExact;
      Worklist.push_back(&MBB);
      GlobalFlags |= StateExact;
    }
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // Take a copy to prevent dangling references.
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions that are followed by WQM computations
  // must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level.
  BI.Needs |= II.Needs;
  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
    BI.InNeeds |= II.Needs;
    Worklist.push_back(MBB);
  }

  // Propagate backwards within the block.
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = II.Needs | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate the WQM flag to instruction inputs.
  assert(II.Needs != (StateWQM | StateExact));
  if (II.Needs != StateWQM)
    return;

  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    // At this point, physical registers appear as inputs or outputs
    // and following them makes no sense (and would in fact be incorrect
    // when the same VGPR is used as both an output and an input that leads
    // to a NeedsWQM instruction).
    //
    // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
    // would have to trace this, but in practice it happens only for 64-bit
    // computations like pointers, where both dwords are followed already
    // anyway.
    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
      continue;

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
      InstrInfo &DefII = Instructions[&DefMI];

      // Obviously skip if DefMI is already flagged as NeedWQM.
      //
      // The instruction might also be flagged as NeedExact. This happens when
      // the result of an atomic is used in a WQM computation. In this case,
      // the atomic must not run for helper pixels and the WQM result is
      // undefined.
      if (DefII.Needs != 0)
        continue;

      DefII.Needs = StateWQM;
      Worklist.push_back(&DefMI);
    }
  }
}
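
// Taken together, propagateInstruction and propagateBlock drive a
// worklist-based fixed-point computation. A loose sketch of the invariants
// being established (reading >= as bitwise containment of the state flags;
// this is a summary of the code, not a formal statement):
//
//   InNeeds(B)  >= Needs(I)   for every instruction I in block B
//   OutNeeds(B) >= InNeeds(S) for every successor S of B
//   InNeeds(B)  >= InNeeds(S) for every successor S of B
//
// so WQM/Exact demands flow backwards through the CFG until nothing changes.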

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions.
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  if (SaveWQM) {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
            SaveWQM)
        .addReg(LiveMaskReg);
  } else {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
            AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC)
        .addReg(LiveMaskReg);
  }
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  if (SavedWQM) {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
        .addReg(SavedWQM);
  } else {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC);
  }
}
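
// toExact/toWQM above emit the actual mode switches. For a switch into Exact
// that must later return to WQM, the emitted pair looks roughly like this
// (illustrative assembly; the register assignments are invented):
//
//   s_and_saveexec_b64 s[2:3], s[0:1]   ; s[0:1] = live mask; save WQM exec
//   ...                                 ; Exact instructions
//   s_mov_b64 exec, s[2:3]              ; restore the saved WQM exec
//
// When control flow never returns to WQM, the cheaper s_and_b64 form is used
// instead and no save register is allocated.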

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  if (!(BI.InNeeds & StateWQM))
    return;

  // This is a non-entry block that is WQM throughout, so there is no need to
  // do anything.
  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
    return;

  unsigned SavedWQMReg = 0;
  bool WQMFromExec = isEntry;
  char State = isEntry ? StateExact : StateWQM;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  while (II != IE) {
    MachineInstr &MI = *II;
    ++II;

    // Skip instructions that are not affected by EXEC.
    if (TII->isScalarUnit(MI) && !MI.isTerminator())
      continue;

    // Generic instructions such as COPY will either disappear by register
    // coalescing or be lowered to SALU or VALU instructions.
    if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
      if (MI.getNumExplicitOperands() >= 1) {
        const MachineOperand &Op = MI.getOperand(0);
        if (Op.isReg()) {
          if (TRI->isSGPRReg(*MRI, Op.getReg())) {
            // SGPR instructions are not affected by EXEC.
            continue;
          }
        }
      }
    }

    char Needs = 0;
    char OutNeeds = 0;
    auto InstrInfoIt = Instructions.find(&MI);
    if (InstrInfoIt != Instructions.end()) {
      Needs = InstrInfoIt->second.Needs;
      OutNeeds = InstrInfoIt->second.OutNeeds;

      // Make sure to switch to Exact mode before the end of the block when
      // Exact and only Exact is needed further downstream.
      if (OutNeeds == StateExact && MI.isTerminator()) {
        assert(Needs == 0);
        Needs = StateExact;
      }
    }

    // State switching
    if (Needs && State != Needs) {
      if (Needs == StateExact) {
        assert(!SavedWQMReg);

        if (!WQMFromExec && (OutNeeds & StateWQM))
          SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

        toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
      } else {
        assert(WQMFromExec == (SavedWQMReg == 0));
        toWQM(MBB, &MI, SavedWQMReg);
        SavedWQMReg = 0;
      }

      State = Needs;
    }
  }

  if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
    assert(WQMFromExec == (SavedWQMReg == 0));
    toWQM(MBB, MBB.end(), SavedWQMReg);
  } else if (BI.OutNeeds == StateExact && State != StateExact) {
    toExact(MBB, MBB.end(), 0, LiveMaskReg);
  }
}

void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
        .addReg(LiveMaskReg);
    MI->eraseFromParent();
  }
}
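
// Example of the query lowering (hypothetical MIR; the virtual register
// names are invented): a live-mask query such as
//
//   %vreg7 = SI_PS_LIVE
//
// is replaced with a copy of the live mask that was saved in the entry
// block, e.g.
//
//   %vreg7 = COPY %LiveMaskReg
//
// or with a copy of EXEC directly when the shader never enters WQM.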

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  Instructions.clear();
  Blocks.clear();
  ExecExports.clear();
  LiveMaskQueries.clear();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();

  char GlobalFlags = analyzeFunction(MF);
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    return !LiveMaskQueries.empty();
  }

  // Store a copy of the original live mask when required.
  unsigned LiveMaskReg = 0;
  {
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
          .addReg(AMDGPU::EXEC);
    }

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC);

      lowerLiveMaskQueries(LiveMaskReg);
      // EntryMI may become invalid here.
      return true;
    }
  }

  lowerLiveMaskQueries(LiveMaskReg);

  // Handle the general case.
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  return true;
}