1//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10/// \file 11/// \brief Insert wait instructions for memory reads and writes. 12/// 13/// Memory reads and writes are issued asynchronously, so we need to insert 14/// S_WAITCNT instructions when we want to access any of their results or 15/// overwrite any register that's used asynchronously. 16// 17//===----------------------------------------------------------------------===// 18 19#include "AMDGPU.h" 20#include "AMDGPUSubtarget.h" 21#include "SIDefines.h" 22#include "SIInstrInfo.h" 23#include "SIMachineFunctionInfo.h" 24#include "llvm/CodeGen/MachineFunction.h" 25#include "llvm/CodeGen/MachineFunctionPass.h" 26#include "llvm/CodeGen/MachineInstrBuilder.h" 27#include "llvm/CodeGen/MachineRegisterInfo.h" 28 29using namespace llvm; 30 31namespace { 32 33/// \brief One variable for each of the hardware counters 34typedef union { 35 struct { 36 unsigned VM; 37 unsigned EXP; 38 unsigned LGKM; 39 } Named; 40 unsigned Array[3]; 41 42} Counters; 43 44typedef enum { 45 OTHER, 46 SMEM, 47 VMEM 48} InstType; 49 50typedef Counters RegCounters[512]; 51typedef std::pair<unsigned, unsigned> RegInterval; 52 53class SIInsertWaits : public MachineFunctionPass { 54 55private: 56 static char ID; 57 const SIInstrInfo *TII; 58 const SIRegisterInfo *TRI; 59 const MachineRegisterInfo *MRI; 60 61 /// \brief Constant hardware limits 62 static const Counters WaitCounts; 63 64 /// \brief Constant zero value 65 static const Counters ZeroCounts; 66 67 /// \brief Counter values we have already waited on. 68 Counters WaitedOn; 69 70 /// \brief Counter values for last instruction issued. 71 Counters LastIssued; 72 73 /// \brief Registers used by async instructions. 74 RegCounters UsedRegs; 75 76 /// \brief Registers defined by async instructions. 77 RegCounters DefinedRegs; 78 79 /// \brief Different export instruction types seen since last wait. 80 unsigned ExpInstrTypesSeen; 81 82 /// \brief Type of the last opcode. 83 InstType LastOpcodeType; 84 85 bool LastInstWritesM0; 86 87 /// \brief Get increment/decrement amount for this instruction. 88 Counters getHwCounts(MachineInstr &MI); 89 90 /// \brief Is operand relevant for async execution? 91 bool isOpRelevant(MachineOperand &Op); 92 93 /// \brief Get register interval an operand affects. 94 RegInterval getRegInterval(MachineOperand &Op); 95 96 /// \brief Handle instructions async components 97 void pushInstruction(MachineBasicBlock &MBB, 98 MachineBasicBlock::iterator I); 99 100 /// \brief Insert the actual wait instruction 101 bool insertWait(MachineBasicBlock &MBB, 102 MachineBasicBlock::iterator I, 103 const Counters &Counts); 104 105 /// \brief Do we need def2def checks? 106 bool unorderedDefines(MachineInstr &MI); 107 108 /// \brief Resolve all operand dependencies to counter requirements 109 Counters handleOperands(MachineInstr &MI); 110 111 /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. 112 void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); 113 114public: 115 SIInsertWaits(TargetMachine &tm) : 116 MachineFunctionPass(ID), 117 TII(nullptr), 118 TRI(nullptr), 119 ExpInstrTypesSeen(0) { } 120 121 bool runOnMachineFunction(MachineFunction &MF) override; 122 123 const char *getPassName() const override { 124 return "SI insert wait instructions"; 125 } 126 127}; 128 129} // End anonymous namespace 130 131char SIInsertWaits::ID = 0; 132 133const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; 134const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; 135 136FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { 137 return new SIInsertWaits(tm); 138} 139 140Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { 141 142 uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; 143 Counters Result; 144 145 Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); 146 147 // Only consider stores or EXP for EXP_CNT 148 Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && 149 (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); 150 151 // LGKM may uses larger values 152 if (TSFlags & SIInstrFlags::LGKM_CNT) { 153 154 if (TII->isSMRD(MI.getOpcode())) { 155 156 MachineOperand &Op = MI.getOperand(0); 157 assert(Op.isReg() && "First LGKM operand must be a register!"); 158 159 unsigned Reg = Op.getReg(); 160 unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); 161 Result.Named.LGKM = Size > 4 ? 2 : 1; 162 163 } else { 164 // DS 165 Result.Named.LGKM = 1; 166 } 167 168 } else { 169 Result.Named.LGKM = 0; 170 } 171 172 return Result; 173} 174 175bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { 176 177 // Constants are always irrelevant 178 if (!Op.isReg()) 179 return false; 180 181 // Defines are always relevant 182 if (Op.isDef()) 183 return true; 184 185 // For exports all registers are relevant 186 MachineInstr &MI = *Op.getParent(); 187 if (MI.getOpcode() == AMDGPU::EXP) 188 return true; 189 190 // For stores the stored value is also relevant 191 if (!MI.getDesc().mayStore()) 192 return false; 193 194 // Check if this operand is the value being stored. 195 // Special case for DS instructions, since the address 196 // operand comes before the value operand and it may have 197 // multiple data operands. 198 199 if (TII->isDS(MI.getOpcode())) { 200 MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); 201 if (Data && Op.isIdenticalTo(*Data)) 202 return true; 203 204 MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); 205 if (Data0 && Op.isIdenticalTo(*Data0)) 206 return true; 207 208 MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); 209 if (Data1 && Op.isIdenticalTo(*Data1)) 210 return true; 211 212 return false; 213 } 214 215 // NOTE: This assumes that the value operand is before the 216 // address operand, and that there is only one value operand. 217 for (MachineInstr::mop_iterator I = MI.operands_begin(), 218 E = MI.operands_end(); I != E; ++I) { 219 220 if (I->isReg() && I->isUse()) 221 return Op.isIdenticalTo(*I); 222 } 223 224 return false; 225} 226 227RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { 228 229 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) 230 return std::make_pair(0, 0); 231 232 unsigned Reg = Op.getReg(); 233 unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); 234 235 assert(Size >= 4); 236 237 RegInterval Result; 238 Result.first = TRI->getEncodingValue(Reg); 239 Result.second = Result.first + Size / 4; 240 241 return Result; 242} 243 244void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, 245 MachineBasicBlock::iterator I) { 246 247 // Get the hardware counter increments and sum them up 248 Counters Increment = getHwCounts(*I); 249 unsigned Sum = 0; 250 251 for (unsigned i = 0; i < 3; ++i) { 252 LastIssued.Array[i] += Increment.Array[i]; 253 Sum += Increment.Array[i]; 254 } 255 256 // If we don't increase anything then that's it 257 if (Sum == 0) { 258 LastOpcodeType = OTHER; 259 return; 260 } 261 262 if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= 263 AMDGPUSubtarget::VOLCANIC_ISLANDS) { 264 // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM 265 // or SMEM clause, respectively. 266 // 267 // The temporary workaround is to break the clauses with S_NOP. 268 // 269 // The proper solution would be to allocate registers such that all source 270 // and destination registers don't overlap, e.g. this is illegal: 271 // r0 = load r2 272 // r2 = load r0 273 if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || 274 (LastOpcodeType == VMEM && Increment.Named.VM)) { 275 // Insert a NOP to break the clause. 276 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) 277 .addImm(0); 278 LastInstWritesM0 = false; 279 } 280 281 if (TII->isSMRD(I->getOpcode())) 282 LastOpcodeType = SMEM; 283 else if (Increment.Named.VM) 284 LastOpcodeType = VMEM; 285 } 286 287 // Remember which export instructions we have seen 288 if (Increment.Named.EXP) { 289 ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; 290 } 291 292 for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { 293 294 MachineOperand &Op = I->getOperand(i); 295 if (!isOpRelevant(Op)) 296 continue; 297 298 RegInterval Interval = getRegInterval(Op); 299 for (unsigned j = Interval.first; j < Interval.second; ++j) { 300 301 // Remember which registers we define 302 if (Op.isDef()) 303 DefinedRegs[j] = LastIssued; 304 305 // and which one we are using 306 if (Op.isUse()) 307 UsedRegs[j] = LastIssued; 308 } 309 } 310} 311 312bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, 313 MachineBasicBlock::iterator I, 314 const Counters &Required) { 315 316 // End of program? No need to wait on anything 317 if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) 318 return false; 319 320 // Figure out if the async instructions execute in order 321 bool Ordered[3]; 322 323 // VM_CNT is always ordered 324 Ordered[0] = true; 325 326 // EXP_CNT is unordered if we have both EXP & VM-writes 327 Ordered[1] = ExpInstrTypesSeen == 3; 328 329 // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS 330 Ordered[2] = false; 331 332 // The values we are going to put into the S_WAITCNT instruction 333 Counters Counts = WaitCounts; 334 335 // Do we really need to wait? 336 bool NeedWait = false; 337 338 for (unsigned i = 0; i < 3; ++i) { 339 340 if (Required.Array[i] <= WaitedOn.Array[i]) 341 continue; 342 343 NeedWait = true; 344 345 if (Ordered[i]) { 346 unsigned Value = LastIssued.Array[i] - Required.Array[i]; 347 348 // Adjust the value to the real hardware possibilities. 349 Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); 350 351 } else 352 Counts.Array[i] = 0; 353 354 // Remember on what we have waited on. 355 WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; 356 } 357 358 if (!NeedWait) 359 return false; 360 361 // Reset EXP_CNT instruction types 362 if (Counts.Named.EXP == 0) 363 ExpInstrTypesSeen = 0; 364 365 // Build the wait instruction 366 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) 367 .addImm((Counts.Named.VM & 0xF) | 368 ((Counts.Named.EXP & 0x7) << 4) | 369 ((Counts.Named.LGKM & 0x7) << 8)); 370 371 LastOpcodeType = OTHER; 372 LastInstWritesM0 = false; 373 return true; 374} 375 376/// \brief helper function for handleOperands 377static void increaseCounters(Counters &Dst, const Counters &Src) { 378 379 for (unsigned i = 0; i < 3; ++i) 380 Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); 381} 382 383Counters SIInsertWaits::handleOperands(MachineInstr &MI) { 384 385 Counters Result = ZeroCounts; 386 387 // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, 388 // but we also want to wait for any other outstanding transfers before 389 // signalling other hardware blocks 390 if (MI.getOpcode() == AMDGPU::S_SENDMSG) 391 return LastIssued; 392 393 // For each register affected by this 394 // instruction increase the result sequence 395 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 396 397 MachineOperand &Op = MI.getOperand(i); 398 RegInterval Interval = getRegInterval(Op); 399 for (unsigned j = Interval.first; j < Interval.second; ++j) { 400 401 if (Op.isDef()) { 402 increaseCounters(Result, UsedRegs[j]); 403 increaseCounters(Result, DefinedRegs[j]); 404 } 405 406 if (Op.isUse()) 407 increaseCounters(Result, DefinedRegs[j]); 408 } 409 } 410 411 return Result; 412} 413 414void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, 415 MachineBasicBlock::iterator I) { 416 if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() < 417 AMDGPUSubtarget::VOLCANIC_ISLANDS) 418 return; 419 420 // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. 421 if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { 422 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); 423 LastInstWritesM0 = false; 424 return; 425 } 426 427 // Set whether this instruction sets M0 428 LastInstWritesM0 = false; 429 430 unsigned NumOperands = I->getNumOperands(); 431 for (unsigned i = 0; i < NumOperands; i++) { 432 const MachineOperand &Op = I->getOperand(i); 433 434 if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) 435 LastInstWritesM0 = true; 436 } 437} 438 439// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" 440// around other non-memory instructions. 441bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { 442 bool Changes = false; 443 444 TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); 445 TRI = 446 static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); 447 448 MRI = &MF.getRegInfo(); 449 450 WaitedOn = ZeroCounts; 451 LastIssued = ZeroCounts; 452 LastOpcodeType = OTHER; 453 LastInstWritesM0 = false; 454 455 memset(&UsedRegs, 0, sizeof(UsedRegs)); 456 memset(&DefinedRegs, 0, sizeof(DefinedRegs)); 457 458 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 459 BI != BE; ++BI) { 460 461 MachineBasicBlock &MBB = *BI; 462 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); 463 I != E; ++I) { 464 465 // Wait for everything before a barrier. 466 if (I->getOpcode() == AMDGPU::S_BARRIER) 467 Changes |= insertWait(MBB, I, LastIssued); 468 else 469 Changes |= insertWait(MBB, I, handleOperands(*I)); 470 471 pushInstruction(MBB, I); 472 handleSendMsg(MBB, I); 473 } 474 475 // Wait for everything at the end of the MBB 476 Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); 477 } 478 479 return Changes; 480} 481