//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC   // Restore the exec mask for the ELSE block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  static const unsigned SkipThreshold = 12;

  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  std::pair<MachineBasicBlock *, MachineBasicBlock *>
  splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

  void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
                               const MachineRegisterInfo &MRI,
                               const MachineInstr &MI,
                               MachineBasicBlock &LoopBB,
                               MachineBasicBlock &RemainderBB,
                               unsigned SaveReg,
                               const MachineOperand &IdxReg);

  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
                              MachineInstr *MovRel,
                              const MachineOperand &IdxReg,
                              int Offset);

  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
                                                       int Offset) const;
  bool indirectSrc(MachineInstr &MI);
  bool indirectDst(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;

FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

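// Conservatively decide whether a skip branch from \p From to \p To is worth
// emitting: returns true once at least SkipThreshold instructions would be
// executed, or when a VCC branch is found that must not execute with EXEC = 0.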
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {

  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // The inline asm length estimate is a byte count, assuming the
        // longest possible instruction encoding.
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

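// Insert an S_CBRANCH_EXECZ that jumps over the region between \p From and
// \p To, but only when shouldSkip decides the branch is worthwhile.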
void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

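// For pixel shaders, insert a block that exports to the null target and ends
// the program once every lane has been killed (EXEC = 0), and branch over it
// otherwise. Returns true if the new skip block was inserted.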
bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
  SkipBB->addSuccessor(&NextBB);

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef);

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}

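// Lower SI_IF: save the current exec mask with S_AND_SAVEEXEC_B64, XOR the
// saved value against EXEC to record the inactive lanes for the matching
// SI_ELSE/SI_END_CF, and optionally branch over the conditional block when
// EXEC is zero.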
void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Reg);

  MI.eraseFromParent();
}

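// Lower SI_ELSE: re-enable the lanes recorded by the matching SI_IF with
// S_OR_SAVEEXEC_B64, then XOR the saved mask into EXEC so that only the else
// lanes remain active.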
void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
            .addReg(AMDGPU::EXEC)
            .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Dst);

  MI.eraseFromParent();
}

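// Lower SI_BREAK: OR the currently active lanes (EXEC) into the accumulated
// break mask for the enclosing loop.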
void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

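// Lower SI_IF_BREAK: OR the lanes for which the break condition holds (VCC)
// into the accumulated break mask.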
void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

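// Lower SI_ELSE_BREAK: merge the break mask produced in the else region with
// the one accumulated so far.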
void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}

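// Lower SI_LOOP: clear the lanes that have taken a break from EXEC and branch
// back to the loop header while any lane is still active.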
void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

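// Lower SI_END_CF: re-enable the lanes that were disabled when the control
// flow region was entered by ORing the saved mask back into EXEC.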
void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

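// Lower SI_KILL_TERMINATOR: disable the lanes whose kill operand is negative,
// either by clearing EXEC outright for a constant operand or by using
// V_CMPX_LE_F32 for a register operand.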
void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative.
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing.
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
           .addImm(0)
           .addOperand(Op);
  }

  MI.eraseFromParent();
}

// All currently live registers must remain so in the remainder block.
void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
                                                 const MachineRegisterInfo &MRI,
                                                 const MachineInstr &MI,
                                                 MachineBasicBlock &LoopBB,
                                                 MachineBasicBlock &RemainderBB,
                                                 unsigned SaveReg,
                                                 const MachineOperand &IdxReg) {
  // Add reg defined in loop body.
  RemainderLiveRegs.addReg(SaveReg);

  if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
    if (!Val->isUndef()) {
      RemainderLiveRegs.addReg(Val->getReg());
      LoopBB.addLiveIn(Val->getReg());
    }
  }

  for (unsigned Reg : RemainderLiveRegs) {
    if (MRI.isAllocatable(Reg))
      RemainderBB.addLiveIn(Reg);
  }

  const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  if (!Src->isUndef())
    LoopBB.addLiveIn(Src->getReg());

  if (!IdxReg.isUndef())
    LoopBB.addLiveIn(IdxReg.getReg());
  LoopBB.sortUniqueLiveIns();
}

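// Emit the body of the waterfall loop: read one remaining index value with
// V_READFIRSTLANE_B32, move it into M0, execute MovRel for the lanes whose
// index matches, and repeat until every active lane has been handled.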
void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
                                                DebugLoc DL,
                                                MachineInstr *MovRel,
                                                const MachineOperand &IdxReg,
                                                int Offset) {
  MachineBasicBlock::iterator I = LoopBB.begin();

  // Read the next variant into VCC (lower 32 bits) <- also loop target
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Move index from VCC into M0
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    .addReg(AMDGPU::VCC_LO);

  // Compare the just read M0 value to all possible Idx values
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
    .addReg(AMDGPU::M0)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Update EXEC, save the original EXEC value to VCC
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
    .addReg(AMDGPU::VCC);

  if (Offset != 0) {
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .addReg(AMDGPU::M0)
      .addImm(Offset);
  }

  // Do the actual move
  LoopBB.insert(I, MovRel);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(AMDGPU::VCC);

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);
}

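// Create an empty block immediately after \p MBB and make it a successor, so
// the export-and-terminate sequence emitted by skipIfDead can be placed there.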
MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}

std::pair<MachineBasicBlock *, MachineBasicBlock *>
SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) {
  MachineFunction *MF = MBB.getParent();

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessors(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);

  return std::make_pair(LoopBB, RemainderBB);
}

// Returns true if a new block was inserted.
bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
    if (Offset != 0) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
        .addImm(Offset);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
    }

    MBB.insert(I, MovRel);
    MI.eraseFromParent();
    return false;
  }

  MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  SaveOp->setIsDead(false);
  unsigned Save = SaveOp->getReg();

  // Reading from a VGPR requires looping over all workitems in the wavefront.
  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
         AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
    .addReg(AMDGPU::EXEC);

  LivePhysRegs RemainderLiveRegs(TRI);

  RemainderLiveRegs.addLiveOuts(MBB);

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;

  std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);

  for (const MachineInstr &Inst : reverse(*RemainderBB))
    RemainderLiveRegs.stepBackward(Inst);

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  LoopBB->addSuccessor(RemainderBB);
  LoopBB->addSuccessor(LoopBB);

  splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
                          *RemainderBB, Save, *Idx);

  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(Save);

  MI.eraseFromParent();
  return true;
}

/// \param VecReg The register which holds element zero of the vector being
///               addressed into.
///
/// \param Offset The constant offset part of the indirect index, e.g.
///               v0 = v[VecReg + Offset].
///
/// \returns The subregister to use as the base of the indirect access, and
///          the remaining constant offset that still needs to be added to the
///          value stored in M0.
std::pair<unsigned, int>
SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int NumElts = SuperRC->getSize() / RC->getSize();

  int BaseRegIdx = TRI->getHWRegIndex(SubReg);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts)
    return std::make_pair(RC->getRegister(BaseRegIdx), Offset);

  int RegIdx = BaseRegIdx + Offset;
  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  unsigned Reg = RC->getRegister(RegIdx);
  return std::make_pair(Reg, Offset);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  if (Idx->getReg() == AMDGPU::NoRegister) {
    // Only had a constant offset, copy the register directly.
    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
    MI.eraseFromParent();
    return false;
  }

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
    .addReg(SrcVec->getReg(), RegState::Implicit);

  return loadM0(MI, MovRel, Offset);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);

  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  if (Idx->getReg() == AMDGPU::NoRegister) {
    // Only had a constant offset, copy the register directly.
    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
      .addOperand(*Val);
    MI.eraseFromParent();
    return false;
  }

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
    .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
    .addReg(Dst, RegState::Implicit);

  return loadM0(MI, MovRel, Offset);
}

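// Walk every block and replace the control flow and indirect addressing
// pseudo instructions with real machine instructions, splitting blocks where
// the indirect-addressing waterfall loop requires it.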
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
        ExecModified = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            HaveKill = false;

            if (skipIfDead(MI, *NextBB)) {
              NextBB = std::next(BI);
              BE = MF.end();
              Next = MBB.end();
            }
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL_TERMINATOR:
          if (Depth == 0) {
            if (skipIfDead(MI, *NextBB)) {
              NextBB = std::next(BI);
              BE = MF.end();
              Next = MBB.end();
            }
          } else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          if (indirectSrc(MI)) {
            // The block was split at this point. We can safely skip the middle
            // inserted block to the following which contains the rest of this
            // block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }

          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          if (indirectDst(MI)) {
            // The block was split at this point. We can safely skip the middle
            // inserted block to the following which contains the rest of this
            // block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }

          break;

        case AMDGPU::SI_RETURN: {
          assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // SI_RETURN is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
                    .addMBB(EmptyMBBAtEnd);
            I->eraseFromParent();
          }
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    MFI->setHasFlatInstructions(true);
  }

  return true;
}