//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single Vector ALU).  Typically, for predicates, a Vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0  // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
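///
/// Loops are handled by the same mechanism.  SI_BREAK ORs the exec bits of
/// lanes leaving the loop into a break mask register, SI_LOOP clears those
/// bits from EXEC and branches back to the loop header while any lane is
/// still active, and SI_END_CF ORs the saved bits back into EXEC.  A rough
/// sketch (register names and labels are illustrative only):
///
/// label_loop:
/// %SGPR2 = S_OR_B64 %EXEC, %SGPR2    // SI_BREAK: remember lanes that left
/// %EXEC = S_ANDN2_B64 %EXEC, %SGPR2  // SI_LOOP: disable finished lanes...
/// S_CBRANCH_EXECNZ label_loop        // ...and loop while any remain active
/// %EXEC = S_OR_B64 %EXEC, %SGPR2     // SI_END_CF: re-enable saved lanes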
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

namespace {

class SILowerControlFlowPass : public MachineFunctionPass {

private:
  static const unsigned SkipThreshold = 12;

  static char ID;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
  void IndirectSrc(MachineInstr &MI);
  void IndirectDst(MachineInstr &MI);

public:
  SILowerControlFlowPass(TargetMachine &tm) :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow instructions";
  }

};

} // End anonymous namespace

char SILowerControlFlowPass::ID = 0;

FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
  return new SILowerControlFlowPass(tm);
}

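// Decide whether it is worth emitting a skip branch over the blocks from
// \p From to \p To.  Returns true once at least SkipThreshold instructions
// have been counted, on the assumption that a shorter region is cheaper to
// execute with all lanes disabled than to branch around.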
bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
                                        MachineBasicBlock *To) {

  unsigned NumInstr = 0;

  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
       MBB = *MBB->succ_begin()) {

    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
         NumInstr < SkipThreshold && I != E; ++I) {

      if (I->isBundle() || !I->isBundled())
        if (++NumInstr >= SkipThreshold)
          return true;
    }
  }

  return false;
}

void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
          .addOperand(To)
          .addReg(AMDGPU::EXEC);
}

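// In a pixel shader, once every lane has been killed there is nothing useful
// left to run.  Insert a null export plus S_ENDPGM that terminate the
// wavefront when EXEC has become zero; live wavefronts branch over them.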
void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
      ShaderType::PIXEL ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .addImm(3)
          .addReg(AMDGPU::EXEC);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
          .addImm(0)
          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
          .addImm(0)
          .addImm(1)
          .addImm(1)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}

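// Lower SI_IF.  Save the incoming exec mask, AND it with the condition so
// only lanes that take the branch stay enabled, and keep the lanes that were
// live but are now disabled in the destination SGPR pair for SI_ELSE /
// SI_END_CF.  The emitted sequence is roughly (operand names illustrative):
//   %dst = S_AND_SAVEEXEC_B64 %vcc
//   %dst = S_XOR_B64 %exec, %dst
//   S_CBRANCH_EXECZ <target>   // only if the region is long enough to skip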
void SILowerControlFlowPass::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

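// Lower SI_ELSE.  At the top of the else block, re-enable the lanes saved by
// SI_IF (S_OR_SAVEEXEC_B64 also records the exec mask of the then block in
// the destination), then XOR that mask out of EXEC so only the else lanes
// run.  The emitted sequence is roughly (operand names illustrative):
//   %dst = S_OR_SAVEEXEC_B64 %src
//   %exec = S_XOR_B64 %exec, %dst
//   S_CBRANCH_EXECZ <target>   // only if the region is long enough to skip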
void SILowerControlFlowPass::Else(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

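// Lower the break pseudos.  SI_BREAK ORs the current EXEC into the loop's
// break mask, SI_IF_BREAK ORs in the lanes for which the break condition
// holds, and SI_ELSE_BREAK merges in the mask saved by SI_ELSE; in all
// three cases a single S_OR_B64 does the work.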
void SILowerControlFlowPass::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}

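// Lower SI_LOOP.  Remove the accumulated break lanes from EXEC and branch
// back to the loop header while any lane remains active, roughly:
//   %exec = S_ANDN2_B64 %exec, %break_mask
//   S_CBRANCH_EXECNZ <loop header>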
void SILowerControlFlowPass::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .addOperand(MI.getOperand(1))
          .addReg(AMDGPU::EXEC);

  MI.eraseFromParent();
}

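// Lower SI_END_CF.  At the start of the join block, OR the mask saved by
// SI_IF / SI_ELSE / the loop back into EXEC so every lane that entered the
// control flow region is running again.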
void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Branch(MachineInstr &MI) {
  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

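// Lower SI_KILL.  A negative operand disables the lane: a constant operand
// either clears EXEC entirely or is a no-op, while a register operand uses
// V_CMPX_LE_F32 to clear the EXEC bit of every lane whose value is below
// zero.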
void SILowerControlFlowPass::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  const SIMachineFunctionInfo *MFI
    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
  // Kill is only allowed in pixel / geometry shaders.
  assert(MFI->getShaderType() == ShaderType::PIXEL ||
         MFI->getShaderType() == ShaderType::GEOMETRY);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
           .addImm(0)
           .addOperand(Op);
  }

  MI.eraseFromParent();
}

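// Load an indirect-addressing index into M0 and insert \p MovRel at the
// right point.  A uniform (SGPR) index is copied straight into M0.  A
// divergent (VGPR) index needs a "waterfall" loop: repeatedly read the index
// of the first active lane with V_READFIRSTLANE_B32, run MovRel for exactly
// the lanes that share that index, and mask them out of EXEC until none
// remain.  The hard-coded branch offset of -7 below depends on this exact
// instruction sequence.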
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I = MI;

  unsigned Save = MI.getOperand(1).getReg();
  unsigned Idx = MI.getOperand(3).getReg();

  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
            .addReg(Idx);
    MBB.insert(I, MovRel);
  } else {

    assert(AMDGPU::SReg_64RegClass.contains(Save));
    assert(AMDGPU::VGPR_32RegClass.contains(Idx));

    // Save the EXEC mask
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
            .addReg(AMDGPU::EXEC);

    // Read the next variant into VCC (lower 32 bits) <- also loop target
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
            AMDGPU::VCC_LO)
            .addReg(Idx);

    // Move index from VCC into M0
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
            .addReg(AMDGPU::VCC_LO);

    // Compare the just read M0 value to all possible Idx values
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
            .addReg(AMDGPU::M0)
            .addReg(Idx);

    // Update EXEC, save the original EXEC value to VCC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
            .addReg(AMDGPU::VCC);

    // Do the actual move
    MBB.insert(I, MovRel);

    // Update EXEC, switch all done bits to 0 and all todo bits to 1
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
            .addReg(AMDGPU::EXEC)
            .addReg(AMDGPU::VCC);

    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
            .addImm(-7)
            .addReg(AMDGPU::EXEC);

    // Restore EXEC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
            .addReg(Save);

  }
  MI.eraseFromParent();
}

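// Lower SI_INDIRECT_SRC: read one dynamically indexed element out of a
// vector of VGPRs with V_MOVRELS_B32, using M0 (set up by LoadM0) as the
// index and the immediate offset to pick the base sub-register.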
void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vec = MI.getOperand(2).getReg();
  unsigned Off = MI.getOperand(4).getImm();
  unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
  if (!SubReg)
    SubReg = Vec;

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
            .addReg(SubReg + Off)
            .addReg(AMDGPU::M0, RegState::Implicit)
            .addReg(Vec, RegState::Implicit);

  LoadM0(MI, MovRel);
}

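// Lower SI_INDIRECT_DST_*: write a value into one dynamically indexed
// element of a vector of VGPRs with V_MOVRELD_B32, again using M0 as the
// index and the immediate offset to pick the base sub-register.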
void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Off = MI.getOperand(4).getImm();
  unsigned Val = MI.getOperand(5).getReg();
  unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
  if (!SubReg)
    SubReg = Dst;

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
            .addReg(SubReg + Off, RegState::Define)
            .addReg(Val)
            .addReg(AMDGPU::M0, RegState::Implicit)
            .addReg(Dst, RegState::Implicit);

  LoadM0(MI, MovRel);
}

bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedWQM = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;
      if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode()))
        NeedWQM = true;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI.getOpcode()))
        NeedFlat = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;
      }
    }
  }

  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
  }


  // FIXME: This seems inappropriate to do here.
  if (NeedFlat && MFI->IsKernel) {
    // Insert the prologue initializing the SGPRs pointing to the scratch space
    // for flat accesses.
    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();

    // TODO: What to use with function calls?

    // FIXME: This is reporting stack size that is used in a scratch buffer
    // rather than registers as well.
    uint64_t StackSizeBytes = FrameInfo->getStackSize();

    int IndirectBegin
      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
    // Convert the register index to units of 256 bytes.
    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);

    assert(StackSizeBytes < 0xffff && StackOffset < 0xffff &&
           "Stack limits should fit in 16 bits");

    // Initialize the flat scratch register pair.
    // TODO: Can we use one s_mov_b64 here?

    // Offset is in units of 256 bytes.
    MachineBasicBlock &MBB = MF.front();
    DebugLoc NoDL;
    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);

    assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));

    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
      .addImm(StackOffset);

    // Documentation says size is "per-thread scratch size in bytes"
    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
      .addImm(StackSizeBytes);
  }

  return true;
}
