//===-- SIInsertWaits.cpp - Insert S_WAITCNT wait instructions ------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Insert wait instructions for memory reads and writes.
12///
13/// Memory reads and writes are issued asynchronously, so we need to insert
14/// S_WAITCNT instructions when we want to access any of their results or
15/// overwrite any register that's used asynchronously.
16//
17//===----------------------------------------------------------------------===//
18
19#include "AMDGPU.h"
20#include "AMDGPUSubtarget.h"
21#include "SIDefines.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
24#include "llvm/CodeGen/MachineFunction.h"
25#include "llvm/CodeGen/MachineFunctionPass.h"
26#include "llvm/CodeGen/MachineInstrBuilder.h"
27#include "llvm/CodeGen/MachineRegisterInfo.h"
28
29#define DEBUG_TYPE "si-insert-waits"
30
31using namespace llvm;
32
33namespace {
34
/// \brief Holds one value per hardware counter (vmcnt, expcnt, lgkmcnt).
///
/// The union allows the counters to be addressed either by name or, for
/// generic loops, as a three-element array; both views alias the same
/// storage.
union Counters {
  struct {
    unsigned VM;
    unsigned EXP;
    unsigned LGKM;
  } Named;
  unsigned Array[3];
};
45
/// \brief Rough classification of the previously issued instruction, used
/// to detect back-to-back VMEM/SMEM clauses.
enum InstType {
  OTHER,
  SMEM,
  VMEM
};
51
52typedef Counters RegCounters[512];
53typedef std::pair<unsigned, unsigned> RegInterval;
54
55class SIInsertWaits : public MachineFunctionPass {
56
57private:
58  const SISubtarget *ST;
59  const SIInstrInfo *TII;
60  const SIRegisterInfo *TRI;
61  const MachineRegisterInfo *MRI;
62
63  /// \brief Constant hardware limits
64  static const Counters WaitCounts;
65
66  /// \brief Constant zero value
67  static const Counters ZeroCounts;
68
69  /// \brief Counter values we have already waited on.
70  Counters WaitedOn;
71
72  /// \brief Counter values that we must wait on before the next counter
73  /// increase.
74  Counters DelayedWaitOn;
75
76  /// \brief Counter values for last instruction issued.
77  Counters LastIssued;
78
79  /// \brief Registers used by async instructions.
80  RegCounters UsedRegs;
81
82  /// \brief Registers defined by async instructions.
83  RegCounters DefinedRegs;
84
85  /// \brief Different export instruction types seen since last wait.
86  unsigned ExpInstrTypesSeen;
87
88  /// \brief Type of the last opcode.
89  InstType LastOpcodeType;
90
91  bool LastInstWritesM0;
92
93  /// \brief Whether the machine function returns void
94  bool ReturnsVoid;
95
96  /// Whether the VCCZ bit is possibly corrupt
97  bool VCCZCorrupt;
98
99  /// \brief Get increment/decrement amount for this instruction.
100  Counters getHwCounts(MachineInstr &MI);
101
102  /// \brief Is operand relevant for async execution?
103  bool isOpRelevant(MachineOperand &Op);
104
105  /// \brief Get register interval an operand affects.
106  RegInterval getRegInterval(const TargetRegisterClass *RC,
107                             const MachineOperand &Reg) const;
108
109  /// \brief Handle instructions async components
110  void pushInstruction(MachineBasicBlock &MBB,
111                       MachineBasicBlock::iterator I,
112                       const Counters& Increment);
113
114  /// \brief Insert the actual wait instruction
115  bool insertWait(MachineBasicBlock &MBB,
116                  MachineBasicBlock::iterator I,
117                  const Counters &Counts);
118
119  /// \brief Handle existing wait instructions (from intrinsics)
120  void handleExistingWait(MachineBasicBlock::iterator I);
121
122  /// \brief Do we need def2def checks?
123  bool unorderedDefines(MachineInstr &MI);
124
125  /// \brief Resolve all operand dependencies to counter requirements
126  Counters handleOperands(MachineInstr &MI);
127
128  /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
129  void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
130
131  /// Return true if there are LGKM instrucitons that haven't been waited on
132  /// yet.
133  bool hasOutstandingLGKM() const;
134
135public:
136  static char ID;
137
138  SIInsertWaits() :
139    MachineFunctionPass(ID),
140    ST(nullptr),
141    TII(nullptr),
142    TRI(nullptr),
143    ExpInstrTypesSeen(0),
144    VCCZCorrupt(false) { }
145
146  bool runOnMachineFunction(MachineFunction &MF) override;
147
148  const char *getPassName() const override {
149    return "SI insert wait instructions";
150  }
151
152  void getAnalysisUsage(AnalysisUsage &AU) const override {
153    AU.setPreservesCFG();
154    MachineFunctionPass::getAnalysisUsage(AU);
155  }
156};
157
158} // End anonymous namespace
159
INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
                      "SI Insert Waits", false, false)
INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
                    "SI Insert Waits", false, false)

char SIInsertWaits::ID = 0;

// Opaque handle other code can use to refer to this pass without including
// this file.
char &llvm::SIInsertWaitsID = SIInsertWaits::ID;

// Factory used by the target machine to add this pass to the pipeline.
FunctionPass *llvm::createSIInsertWaitsPass() {
  return new SIInsertWaits();
}

// Hardware maxima for the three S_WAITCNT fields: vmcnt is 4 bits (15),
// expcnt is 3 bits (7), lgkmcnt is 4 bits (15) — matching the 0xF/0x7/0xF
// masks used when the S_WAITCNT immediate is encoded in insertWait().
const Counters SIInsertWaits::WaitCounts = { { 15, 7, 15 } };
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
175
176static bool readsVCCZ(unsigned Opcode) {
177  return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ;
178}
179
// True while LGKM operations have been issued that we have not yet waited
// on (the issued count is ahead of the waited-on count).
bool SIInsertWaits::hasOutstandingLGKM() const {
  return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
}
183
184Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
185  uint64_t TSFlags = MI.getDesc().TSFlags;
186  Counters Result = { { 0, 0, 0 } };
187
188  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
189
190  // Only consider stores or EXP for EXP_CNT
191  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
192      (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
193
194  // LGKM may uses larger values
195  if (TSFlags & SIInstrFlags::LGKM_CNT) {
196
197    if (TII->isSMRD(MI)) {
198
199      if (MI.getNumOperands() != 0) {
200        assert(MI.getOperand(0).isReg() &&
201               "First LGKM operand must be a register!");
202
203        // XXX - What if this is a write into a super register?
204        const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
205        unsigned Size = RC->getSize();
206        Result.Named.LGKM = Size > 4 ? 2 : 1;
207      } else {
208        // s_dcache_inv etc. do not have a a destination register. Assume we
209        // want a wait on these.
210        // XXX - What is the right value?
211        Result.Named.LGKM = 1;
212      }
213    } else {
214      // DS
215      Result.Named.LGKM = 1;
216    }
217
218  } else {
219    Result.Named.LGKM = 0;
220  }
221
222  return Result;
223}
224
/// \brief Decide whether \p Op must be tracked in UsedRegs/DefinedRegs:
/// all register defs, every register of an EXP instruction, and the value
/// operand(s) of store instructions.
bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
  // Constants are always irrelevant
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
    return false;

  // Defines are always relevant
  if (Op.isDef())
    return true;

  // For exports all registers are relevant
  MachineInstr &MI = *Op.getParent();
  if (MI.getOpcode() == AMDGPU::EXP)
    return true;

  // For stores the stored value is also relevant
  if (!MI.getDesc().mayStore())
    return false;

  // Check if this operand is the value being stored.
  // Special case for DS/FLAT instructions, since the address
  // operand comes before the value operand and it may have
  // multiple data operands.

  if (TII->isDS(MI) || TII->isFLAT(MI)) {
    MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
    if (Data && Op.isIdenticalTo(*Data))
      return true;
  }

  // DS write2/write2st64 style instructions carry two data operands.
  if (TII->isDS(MI)) {
    MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
    if (Data0 && Op.isIdenticalTo(*Data0))
      return true;

    MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
    return Data1 && Op.isIdenticalTo(*Data1);
  }

  // NOTE: This assumes that the value operand is before the
  // address operand, and that there is only one value operand.
  // For other stores: Op is relevant only if it is the first register use,
  // which by the assumption above is the stored value.
  for (MachineInstr::mop_iterator I = MI.operands_begin(),
       E = MI.operands_end(); I != E; ++I) {

    if (I->isReg() && I->isUse())
      return Op.isIdenticalTo(*I);
  }

  return false;
}
274
275RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
276                                          const MachineOperand &Reg) const {
277  unsigned Size = RC->getSize();
278  assert(Size >= 4);
279
280  RegInterval Result;
281  Result.first = TRI->getEncodingValue(Reg.getReg());
282  Result.second = Result.first + Size / 4;
283
284  return Result;
285}
286
/// \brief Account for instruction \p I being issued: bump LastIssued by
/// \p Increment, break VMEM/SMEM clauses on VI, and record which registers
/// the instruction defines/uses at the new counter levels.
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const Counters &Increment) {

  // Get the hardware counter increments and sum them up
  Counters Limit = ZeroCounts;
  unsigned Sum = 0;

  for (unsigned i = 0; i < 3; ++i) {
    LastIssued.Array[i] += Increment.Array[i];
    if (Increment.Array[i])
      Limit.Array[i] = LastIssued.Array[i];
    Sum += Increment.Array[i];
  }

  // If we don't increase anything then that's it
  if (Sum == 0) {
    LastOpcodeType = OTHER;
    return;
  }

  if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
    // or SMEM clause, respectively.
    //
    // The temporary workaround is to break the clauses with S_NOP.
    //
    // The proper solution would be to allocate registers such that all source
    // and destination registers don't overlap, e.g. this is illegal:
    //   r0 = load r2
    //   r2 = load r0
    if (LastOpcodeType == VMEM && Increment.Named.VM) {
      // Insert a NOP to break the clause.
      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
      LastInstWritesM0 = false;
    }

    if (TII->isSMRD(*I))
      LastOpcodeType = SMEM;
    else if (Increment.Named.VM)
      LastOpcodeType = VMEM;
  }

  // Remember which export instructions we have seen
  if (Increment.Named.EXP) {
    // Bit 1 = EXP instruction seen, bit 2 = VM-write seen (checked against
    // the value 3 in insertWait to decide whether EXP_CNT is unordered).
    ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
  }

  // Snapshot the new counter levels for every register this instruction
  // touches, so later instructions can compute what they must wait for.
  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
    MachineOperand &Op = I->getOperand(i);
    if (!isOpRelevant(Op))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {

      // Remember which registers we define
      if (Op.isDef())
        DefinedRegs[j] = Limit;

      // and which one we are using
      if (Op.isUse())
        UsedRegs[j] = Limit;
    }
  }
}
355
/// \brief Emit an S_WAITCNT before \p I if the counters in \p Required have
/// not been waited on yet.  Returns true if an instruction was inserted.
bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const Counters &Required) {

  // End of program? No need to wait on anything
  // A function not returning void needs to wait, because other bytecode will
  // be appended after it and we don't know what it will be.
  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
    return false;

  // Figure out if the async instructions execute in order
  bool Ordered[3];

  // VM_CNT is always ordered
  Ordered[0] = true;

  // EXP_CNT is unordered if we have both EXP & VM-writes
  // (ExpInstrTypesSeen accumulates bit 1 for EXP and bit 2 for VM-writes).
  Ordered[1] = ExpInstrTypesSeen == 3;

  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
  Ordered[2] = false;

  // The values we are going to put into the S_WAITCNT instruction
  Counters Counts = WaitCounts;

  // Do we really need to wait?
  bool NeedWait = false;

  for (unsigned i = 0; i < 3; ++i) {

    // Already covered by a previous wait.
    if (Required.Array[i] <= WaitedOn.Array[i])
      continue;

    NeedWait = true;

    if (Ordered[i]) {
      // For in-order counters we can wait for "issued minus required"
      // operations to still be outstanding instead of waiting for zero.
      unsigned Value = LastIssued.Array[i] - Required.Array[i];

      // Adjust the value to the real hardware possibilities.
      Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);

    } else
      Counts.Array[i] = 0;

    // Remember on what we have waited on.
    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
  }

  if (!NeedWait)
    return false;

  // Reset EXP_CNT instruction types
  if (Counts.Named.EXP == 0)
    ExpInstrTypesSeen = 0;

  // Build the wait instruction, packing the three counts into the
  // S_WAITCNT immediate: vmcnt[3:0], expcnt[6:4], lgkmcnt[11:8].
  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
          .addImm((Counts.Named.VM & 0xF) |
                  ((Counts.Named.EXP & 0x7) << 4) |
                  ((Counts.Named.LGKM & 0xF) << 8));

  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  return true;
}
421
422/// \brief helper function for handleOperands
423static void increaseCounters(Counters &Dst, const Counters &Src) {
424
425  for (unsigned i = 0; i < 3; ++i)
426    Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
427}
428
429/// \brief check whether any of the counters is non-zero
430static bool countersNonZero(const Counters &Counter) {
431  for (unsigned i = 0; i < 3; ++i)
432    if (Counter.Array[i])
433      return true;
434  return false;
435}
436
/// \brief Fold a pre-existing S_WAITCNT (e.g. from an intrinsic) into
/// DelayedWaitOn so the next inserted wait honors it; the original
/// instruction is removed by the caller.
void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
  assert(I->getOpcode() == AMDGPU::S_WAITCNT);

  unsigned Imm = I->getOperand(0).getImm();
  Counters Counts, WaitOn;

  // Decode the S_WAITCNT immediate: vmcnt[3:0], expcnt[6:4], lgkmcnt[11:8].
  Counts.Named.VM = Imm & 0xF;
  Counts.Named.EXP = (Imm >> 4) & 0x7;
  Counts.Named.LGKM = (Imm >> 8) & 0xF;

  // Translate "allow N outstanding" into the absolute issue count that must
  // have completed, clamping at zero.
  for (unsigned i = 0; i < 3; ++i) {
    if (Counts.Array[i] <= LastIssued.Array[i])
      WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
    else
      WaitOn.Array[i] = 0;
  }

  increaseCounters(DelayedWaitOn, WaitOn);
}
456
/// \brief Compute the counter levels \p MI must wait for before it can
/// issue, i.e. the maximum recorded counter snapshot over all registers it
/// reads (wait for defs) or writes (wait for both prior defs and uses).
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {

  Counters Result = ZeroCounts;

  // For each register affected by this instruction increase the result
  // sequence.
  //
  // TODO: We could probably just look at explicit operands if we removed VCC /
  // EXEC from SMRD dest reg classes.
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {

      // Overwriting a register: wait for earlier uses and defs of it
      // (write-after-read and write-after-write hazards).
      if (Op.isDef()) {
        increaseCounters(Result, UsedRegs[j]);
        increaseCounters(Result, DefinedRegs[j]);
      }

      // Reading a register: wait for the async def that produces it
      // (read-after-write hazard).
      if (Op.isUse())
        increaseCounters(Result, DefinedRegs[j]);
    }
  }

  return Result;
}
487
488void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
489                                  MachineBasicBlock::iterator I) {
490  if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
491    return;
492
493  // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
494  if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
495    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
496    LastInstWritesM0 = false;
497    return;
498  }
499
500  // Set whether this instruction sets M0
501  LastInstWritesM0 = false;
502
503  unsigned NumOperands = I->getNumOperands();
504  for (unsigned i = 0; i < NumOperands; i++) {
505    const MachineOperand &Op = I->getOperand(i);
506
507    if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
508      LastInstWritesM0 = true;
509  }
510}
511
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
//
// Main driver: walk every instruction of every block, insert the S_WAITCNT
// instructions its operands require, and track counter state as each
// instruction is issued.  Returns true if any instruction was inserted.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
  bool Changes = false;

  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();

  // Reset per-function tracking state.
  // NOTE(review): VCCZCorrupt is not reset here, so a 'true' value can carry
  // over from a previous function.  That only causes an extra (harmless)
  // workaround sequence, but confirm this is intentional.
  WaitedOn = ZeroCounts;
  DelayedWaitOn = ZeroCounts;
  LastIssued = ZeroCounts;
  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();

  memset(&UsedRegs, 0, sizeof(UsedRegs));
  memset(&DefinedRegs, 0, sizeof(DefinedRegs));

  // Explicit S_WAITCNTs we folded into DelayedWaitOn; erased after the walk
  // so iterators stay valid.
  SmallVector<MachineInstr *, 4> RemoveMI;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {

      if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
        //    complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.

        if (TII->isSMRD(I->getOpcode())) {
          VCCZCorrupt = true;
        } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
          // FIXME: We only care about SMRD instructions here, not LDS or GDS.
          // Whenever we store a value in vcc, the correct value of vccz is
          // restored.
          VCCZCorrupt = false;
        }

        // Check if we need to apply the bug work-around
        if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
          DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');

          // Wait on everything, not just LGKM.  vccz reads usually come from
          // terminators, and we always wait on everything at the end of the
          // block, so if we only wait on LGKM here, we might end up with
          // another s_waitcnt inserted right after this if there are non-LGKM
          // instructions still outstanding.
          insertWait(MBB, I, LastIssued);

          // Restore the vccz bit.  Any time a value is written to vcc, the vcc
          // bit is updated, so we can restore the bit by reading the value of
          // vcc and then writing it back to the register.
          BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
                  AMDGPU::VCC)
                  .addReg(AMDGPU::VCC);
        }
      }

      // Record pre-existing, explicitly requested waits
      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
        handleExistingWait(*I);
        RemoveMI.push_back(&*I);
        continue;
      }

      Counters Required;

      // Wait for everything before a barrier.
      //
      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
      // but we also want to wait for any other outstanding transfers before
      // signalling other hardware blocks
      if (I->getOpcode() == AMDGPU::S_BARRIER ||
          I->getOpcode() == AMDGPU::S_SENDMSG)
        Required = LastIssued;
      else
        Required = handleOperands(*I);

      Counters Increment = getHwCounts(*I);

      // Merge any waits requested by removed explicit S_WAITCNTs into the
      // next real wait point.
      if (countersNonZero(Required) || countersNonZero(Increment))
        increaseCounters(Required, DelayedWaitOn);

      Changes |= insertWait(MBB, I, Required);

      pushInstruction(MBB, I, Increment);
      handleSendMsg(MBB, I);
    }

    // Wait for everything at the end of the MBB
    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
  }

  for (MachineInstr *I : RemoveMI)
    I->eraseFromParent();

  return Changes;
}
619