// AMDILCFGStructurizer.cpp revision 36b56886974eae4f9c5ebc96befd3e7bfe5de338
1//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8/// \file
9//==-----------------------------------------------------------------------===//
10
11#define DEBUG_TYPE "structcfg"
12
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "R600InstrInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
34
35using namespace llvm;
36
37#define DEFAULT_VEC_SLOTS 8
38
39// TODO: move-begin.
40
41//===----------------------------------------------------------------------===//
42//
43// Statistics for CFGStructurizer.
44//
45//===----------------------------------------------------------------------===//
46
47STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
48    "matched");
49STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
50    "matched");
51STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
52    "pattern matched");
53STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
54STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
55
56namespace llvm {
57  void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
58}
59
60//===----------------------------------------------------------------------===//
61//
62// Miscellaneous utility for CFGStructurizer.
63//
64//===----------------------------------------------------------------------===//
65namespace {
66#define SHOWNEWINSTR(i) \
67  DEBUG(dbgs() << "New instr: " << *i << "\n");
68
69#define SHOWNEWBLK(b, msg) \
70DEBUG( \
71  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
72  dbgs() << "\n"; \
73);
74
75#define SHOWBLK_DETAIL(b, msg) \
76DEBUG( \
77  if (b) { \
78  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
79  b->print(dbgs()); \
80  dbgs() << "\n"; \
81  } \
82);
83
84#define INVALIDSCCNUM -1
85
86template<class NodeT>
87void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
88  size_t sz = Src.size();
89  for (size_t i = 0; i < sz/2; ++i) {
90    NodeT *t = Src[i];
91    Src[i] = Src[sz - i - 1];
92    Src[sz - i - 1] = t;
93  }
94}
95
96} // end anonymous namespace
97
98//===----------------------------------------------------------------------===//
99//
100// supporting data structure for CFGStructurizer
101//
102//===----------------------------------------------------------------------===//
103
104
105namespace {
106
107class BlockInformation {
108public:
109  bool IsRetired;
110  int  SccNum;
111  BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {}
112};
113
114} // end anonymous namespace
115
116//===----------------------------------------------------------------------===//
117//
118// CFGStructurizer
119//
120//===----------------------------------------------------------------------===//
121
122namespace {
/// Machine-level CFG structurizer pass.
///
/// Repeatedly pattern-matches the CFG (serial, if, and loop patterns) and
/// collapses each matched region, until the function is reduced to a single
/// block; along the way it rewrites generic branches into the structured
/// AMDGPU control-flow pseudo instructions.
class AMDGPUCFGStructurizer : public MachineFunctionPass {
public:
  typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
  typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
  typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;

  /// Result of singlePathTo(): classifies the unique-successor chain walked
  /// from a source block towards a destination block.
  enum PathToKind {
    Not_SinglePath = 0,
    SinglePath_InPath = 1,
    SinglePath_NotInPath = 2
  };

  static char ID;

  AMDGPUCFGStructurizer() :
      MachineFunctionPass(ID), TII(NULL), TRI(NULL) {
    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
  }

   const char *getPassName() const {
    return "AMDGPU Control Flow Graph structurizer Pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addPreserved<MachineFunctionAnalysis>();
    AU.addRequired<MachineFunctionAnalysis>();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    AU.addRequired<MachineLoopInfo>();
  }

  /// Perform the CFG structurization
  bool run();

  /// Perform the CFG preparation
  /// This step will remove every unconditional/dead jump instruction and make
  /// sure all loops have an exit block
  bool prepare();

  bool runOnMachineFunction(MachineFunction &MF) {
    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
    TRI = &TII->getRegisterInfo();
    DEBUG(MF.dump(););
    OrderedBlks.clear();
    FuncRep = &MF;
    MLI = &getAnalysis<MachineLoopInfo>();
    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
    MDT = &getAnalysis<MachineDominatorTree>();
    DEBUG(MDT->print(dbgs(), (const llvm::Module*)0););
    PDT = &getAnalysis<MachinePostDominatorTree>();
    DEBUG(PDT->print(dbgs()););
    // prepare() normalizes the CFG; run() then does the actual reduction.
    prepare();
    run();
    DEBUG(MF.dump(););
    return true;
  }

protected:
  // Analyses and target hooks cached by runOnMachineFunction().
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;
  MachineLoopInfo *MLI;
  const R600InstrInfo *TII;
  const AMDGPURegisterInfo *TRI;

  // PRINT FUNCTIONS
  /// Print the ordered Blocks.
  void printOrderedBlocks() const {
    size_t i = 0;
    for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
        iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
      dbgs() << "BB" << (*iterBlk)->getNumber();
      dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
      if (i != 0 && i % 10 == 0) {
        dbgs() << "\n";
      } else {
        dbgs() << " ";
      }
    }
  }
  static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
    for (MachineLoop::iterator iter = LoopInfo.begin(),
         iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
      (*iter)->print(dbgs(), 0);
    }
  }

  // UTILITY FUNCTIONS
  int getSCCNum(MachineBasicBlock *MBB) const;
  MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
  bool hasBackEdge(MachineBasicBlock *MBB) const;
  static unsigned getLoopDepth(MachineLoop *LoopRep);
  bool isRetiredBlock(MachineBasicBlock *MBB) const;
  bool isActiveLoophead(MachineBasicBlock *MBB) const;
  PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
      bool AllowSideEntry = true) const;
  int countActiveBlock(MBBVector::const_iterator It,
      MBBVector::const_iterator E) const;
  bool needMigrateBlock(MachineBasicBlock *MBB) const;

  // Utility Functions
  void reversePredicateSetter(MachineBasicBlock::iterator I);
  /// Compute the reversed DFS post order of Blocks
  void orderBlocks(MachineFunction *MF);

  // Function originally from CFGStructTraits
  void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
      DebugLoc DL = DebugLoc());
  MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
    DebugLoc DL = DebugLoc());
  MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
  void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
      DebugLoc DL);
  void insertCondBranchBefore(MachineBasicBlock *MBB,
      MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
      DebugLoc DL);
  void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum);
  static int getBranchNzeroOpcode(int OldOpcode);
  static int getBranchZeroOpcode(int OldOpcode);
  static int getContinueNzeroOpcode(int OldOpcode);
  static int getContinueZeroOpcode(int OldOpcode);
  static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
  static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
  static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
      MachineInstr *MI);
  static bool isCondBranch(MachineInstr *MI);
  static bool isUncondBranch(MachineInstr *MI);
  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
  /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
  ///
  /// BB with backward-edge could have move instructions after the branch
  /// instruction.  Such move instruction "belong to" the loop backward-edge.
  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
  static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
  static bool isReturnBlock(MachineBasicBlock *MBB);
  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
      MachineBasicBlock *SrcMBB) ;
  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
  /// because the AMDGPU instruction is not recognized as terminator fix this
  /// and retire this routine
  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
  static void wrapup(MachineBasicBlock *MBB);


  int patternMatch(MachineBasicBlock *MBB);
  int patternMatchGroup(MachineBasicBlock *MBB);
  int serialPatternMatch(MachineBasicBlock *MBB);
  int ifPatternMatch(MachineBasicBlock *MBB);
  int loopendPatternMatch();
  int mergeLoop(MachineLoop *LoopRep);
  int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader);

  void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
      MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
      MachineLoop *ContLoop);
  /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
  /// the same loop with LoopLandInfo without explicitly keeping track of
  /// loopContBlks and loopBreakBlks, this is a method to get the information.
  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
      MachineBasicBlock *Src2MBB);
  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
      MachineBasicBlock **LandMBBPtr);
  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
      MachineBasicBlock *LandMBB, bool Detail = false);
  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
  void mergeSerialBlock(MachineBasicBlock *DstMBB,
      MachineBasicBlock *SrcMBB);

  void mergeIfthenelseBlock(MachineInstr *BranchMI,
      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
      MachineBasicBlock *LandMBB);
  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
      MachineBasicBlock *LandMBB);
  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
      MachineBasicBlock *ContMBB);
  /// normalizeInfiniteLoopExit change
  ///   B1:
  ///        uncond_br LoopHeader
  ///
  /// to
  ///   B1:
  ///        cond_br 1 LoopHeader dummyExit
  /// and return the newly added dummy exit block
  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
  void removeUnconditionalBranch(MachineBasicBlock *MBB);
  /// Remove duplicate branches instructions in a block.
  /// For instance
  /// B0:
  ///    cond_br X B1 B2
  ///    cond_br X B1 B2
  /// is transformed to
  /// B0:
  ///    cond_br X B1 B2
  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
  void removeSuccessor(MachineBasicBlock *MBB);
  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
      MachineBasicBlock *PredMBB);
  void migrateInstruction(MachineBasicBlock *SrcMBB,
      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
  void retireBlock(MachineBasicBlock *MBB);
  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = NULL);

  MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
  /// This is work around solution for findNearestCommonDominator not available
  /// to post dom a proper fix should go to Dominators.h.
  MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
      MachineBasicBlock *MBB2);

private:
  // Per-block retirement state and SCC numbers (owned; freed in run()).
  MBBInfoMap BlockInfoMap;
  // Loop -> landing block recorded by setLoopLandBlock().
  LoopLandInfoMap LLInfoMap;
  std::map<MachineLoop *, bool> Visited;
  // The function currently being structurized.
  MachineFunction *FuncRep;
  // Blocks in reversed DFS post order, filled by orderBlocks().
  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
};
352
353int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
354  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
355  if (It == BlockInfoMap.end())
356    return INVALIDSCCNUM;
357  return (*It).second->SccNum;
358}
359
360MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
361    const {
362  LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
363  if (It == LLInfoMap.end())
364    return NULL;
365  return (*It).second;
366}
367
368bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
369  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
370  if (!LoopRep)
371    return false;
372  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
373  return MBB->isSuccessor(LoopHeader);
374}
375
376unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) {
377  return LoopRep ? LoopRep->getLoopDepth() : 0;
378}
379
380bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
381  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
382  if (It == BlockInfoMap.end())
383    return false;
384  return (*It).second->IsRetired;
385}
386
387bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
388  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
389  while (LoopRep && LoopRep->getHeader() == MBB) {
390    MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
391    if(!LoopLand)
392      return true;
393    if (!isRetiredBlock(LoopLand))
394      return true;
395    LoopRep = LoopRep->getParentLoop();
396  }
397  return false;
398}
399AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
400    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
401    bool AllowSideEntry) const {
402  assert(DstMBB);
403  if (SrcMBB == DstMBB)
404    return SinglePath_InPath;
405  while (SrcMBB && SrcMBB->succ_size() == 1) {
406    SrcMBB = *SrcMBB->succ_begin();
407    if (SrcMBB == DstMBB)
408      return SinglePath_InPath;
409    if (!AllowSideEntry && SrcMBB->pred_size() > 1)
410      return Not_SinglePath;
411  }
412  if (SrcMBB && SrcMBB->succ_size()==0)
413    return SinglePath_NotInPath;
414  return Not_SinglePath;
415}
416
417int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
418    MBBVector::const_iterator E) const {
419  int Count = 0;
420  while (It != E) {
421    if (!isRetiredBlock(*It))
422      ++Count;
423    ++It;
424  }
425  return Count;
426}
427
428bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
429  unsigned BlockSizeThreshold = 30;
430  unsigned CloneInstrThreshold = 100;
431  bool MultiplePreds = MBB && (MBB->pred_size() > 1);
432
433  if(!MultiplePreds)
434    return false;
435  unsigned BlkSize = MBB->size();
436  return ((BlkSize > BlockSizeThreshold) &&
437      (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
438}
439
// Walk backwards from I to the PRED_X instruction that sets the branch
// predicate, then invert its compare opcode (immediate operand 2) so the
// predicate tests the opposite condition.
// NOTE(review): if no PRED_X precedes I, the `while (I--)` walks past the
// start of the block and silently falls off the end -- presumably callers
// guarantee a predicate setter exists before I; confirm before relying on it.
void AMDGPUCFGStructurizer::reversePredicateSetter(
    MachineBasicBlock::iterator I) {
  while (I--) {
    if (I->getOpcode() == AMDGPU::PRED_X) {
      // Swap each compare opcode with its logical negation.
      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
      case OPCODE_IS_ZERO_INT:
        static_cast<MachineInstr *>(I)->getOperand(2)
            .setImm(OPCODE_IS_NOT_ZERO_INT);
        return;
      case OPCODE_IS_NOT_ZERO_INT:
        static_cast<MachineInstr *>(I)->getOperand(2)
            .setImm(OPCODE_IS_ZERO_INT);
        return;
      case OPCODE_IS_ZERO:
        static_cast<MachineInstr *>(I)->getOperand(2)
            .setImm(OPCODE_IS_NOT_ZERO);
        return;
      case OPCODE_IS_NOT_ZERO:
        static_cast<MachineInstr *>(I)->getOperand(2)
            .setImm(OPCODE_IS_ZERO);
        return;
      default:
        llvm_unreachable("PRED_X Opcode invalid!");
      }
    }
  }
}
467
468void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
469    int NewOpcode, DebugLoc DL) {
470 MachineInstr *MI = MBB->getParent()
471    ->CreateMachineInstr(TII->get(NewOpcode), DL);
472  MBB->push_back(MI);
473  //assume the instruction doesn't take any reg operand ...
474  SHOWNEWINSTR(MI);
475}
476
477MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
478    int NewOpcode, DebugLoc DL) {
479  MachineInstr *MI =
480      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
481  if (MBB->begin() != MBB->end())
482    MBB->insert(MBB->begin(), MI);
483  else
484    MBB->push_back(MI);
485  SHOWNEWINSTR(MI);
486  return MI;
487}
488
489MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
490    MachineBasicBlock::iterator I, int NewOpcode) {
491  MachineInstr *OldMI = &(*I);
492  MachineBasicBlock *MBB = OldMI->getParent();
493  MachineInstr *NewMBB =
494      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
495  MBB->insert(I, NewMBB);
496  //assume the instruction doesn't take any reg operand ...
497  SHOWNEWINSTR(NewMBB);
498  return NewMBB;
499}
500
501void AMDGPUCFGStructurizer::insertCondBranchBefore(
502    MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) {
503  MachineInstr *OldMI = &(*I);
504  MachineBasicBlock *MBB = OldMI->getParent();
505  MachineFunction *MF = MBB->getParent();
506  MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
507  MBB->insert(I, NewMI);
508  MachineInstrBuilder MIB(*MF, NewMI);
509  MIB.addReg(OldMI->getOperand(1).getReg(), false);
510  SHOWNEWINSTR(NewMI);
511  //erase later oldInstr->eraseFromParent();
512}
513
514void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk,
515    MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
516    DebugLoc DL) {
517  MachineFunction *MF = blk->getParent();
518  MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
519  //insert before
520  blk->insert(I, NewInstr);
521  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
522  SHOWNEWINSTR(NewInstr);
523}
524
525void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB,
526    int NewOpcode, int RegNum) {
527  MachineFunction *MF = MBB->getParent();
528  MachineInstr *NewInstr =
529    MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
530  MBB->push_back(NewInstr);
531  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
532  SHOWNEWINSTR(NewInstr);
533}
534
535int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
536  switch(OldOpcode) {
537  case AMDGPU::JUMP_COND:
538  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
539  case AMDGPU::BRANCH_COND_i32:
540  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
541  default: llvm_unreachable("internal error");
542  }
543  return -1;
544}
545
546int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
547  switch(OldOpcode) {
548  case AMDGPU::JUMP_COND:
549  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
550  case AMDGPU::BRANCH_COND_i32:
551  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
552  default: llvm_unreachable("internal error");
553  }
554  return -1;
555}
556
557int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
558  switch(OldOpcode) {
559  case AMDGPU::JUMP_COND:
560  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
561  default: llvm_unreachable("internal error");
562  };
563  return -1;
564}
565
566int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
567  switch(OldOpcode) {
568  case AMDGPU::JUMP_COND:
569  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
570  default: llvm_unreachable("internal error");
571  }
572  return -1;
573}
574
575MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
576  return MI->getOperand(0).getMBB();
577}
578
579void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
580    MachineBasicBlock *MBB) {
581  MI->getOperand(0).setMBB(MBB);
582}
583
584MachineBasicBlock *
585AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
586    MachineInstr *MI) {
587  assert(MBB->succ_size() == 2);
588  MachineBasicBlock *TrueBranch = getTrueBranch(MI);
589  MachineBasicBlock::succ_iterator It = MBB->succ_begin();
590  MachineBasicBlock::succ_iterator Next = It;
591  ++Next;
592  return (*It == TrueBranch) ? *Next : *It;
593}
594
595bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
596  switch (MI->getOpcode()) {
597    case AMDGPU::JUMP_COND:
598    case AMDGPU::BRANCH_COND_i32:
599    case AMDGPU::BRANCH_COND_f32: return true;
600  default:
601    return false;
602  }
603  return false;
604}
605
606bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
607  switch (MI->getOpcode()) {
608  case AMDGPU::JUMP:
609  case AMDGPU::BRANCH:
610    return true;
611  default:
612    return false;
613  }
614  return false;
615}
616
DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
  // Scan the whole block and keep the DebugLoc of the *last* instruction
  // that has one (the loop overwrites DL on every match, so despite the old
  // comment this is not the first location).  Returns an unknown DebugLoc
  // when no instruction in the block carries debug info.
  DebugLoc DL;
  for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
      ++It) {
    MachineInstr *instr = &(*It);
    if (instr->getDebugLoc().isUnknown() == false)
      DL = instr->getDebugLoc();
  }
  return DL;
}
628
629MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
630    MachineBasicBlock *MBB) {
631  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
632  MachineInstr *MI = &*It;
633  if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
634    return MI;
635  return NULL;
636}
637
638MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
639    MachineBasicBlock *MBB) {
640  for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
641      It != E; ++It) {
642    // FIXME: Simplify
643    MachineInstr *MI = &*It;
644    if (MI) {
645      if (isCondBranch(MI) || isUncondBranch(MI))
646        return MI;
647      else if (!TII->isMov(MI->getOpcode()))
648        break;
649    }
650  }
651  return NULL;
652}
653
654MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
655  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
656  if (It != MBB->rend()) {
657    MachineInstr *instr = &(*It);
658    if (instr->getOpcode() == AMDGPU::RETURN)
659      return instr;
660  }
661  return NULL;
662}
663
664MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
665  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
666  if (It != MBB->rend()) {
667    MachineInstr *MI = &(*It);
668    if (MI->getOpcode() == AMDGPU::CONTINUE)
669      return MI;
670  }
671  return NULL;
672}
673
674bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
675  MachineInstr *MI = getReturnInstr(MBB);
676  bool IsReturn = (MBB->succ_size() == 0);
677  if (MI)
678    assert(IsReturn);
679  else if (IsReturn)
680    DEBUG(
681      dbgs() << "BB" << MBB->getNumber()
682             <<" is return block without RETURN instr\n";);
683  return  IsReturn;
684}
685
686void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
687    MachineBasicBlock *SrcMBB) {
688  for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(),
689       iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It)
690    DstMBB->addSuccessor(*It);  // *iter's predecessor is also taken care of
691}
692
693MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
694  MachineFunction *Func = MBB->getParent();
695  MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
696  Func->push_back(NewMBB);  //insert to function
697  for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end();
698      It != E; ++It) {
699    MachineInstr *MI = Func->CloneMachineInstr(It);
700    NewMBB->push_back(MI);
701  }
702  return NewMBB;
703}
704
705void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
706    MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
707    MachineBasicBlock *NewBlk) {
708  MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
709  if (BranchMI && isCondBranch(BranchMI) &&
710      getTrueBranch(BranchMI) == OldMBB)
711    setTrueBranch(BranchMI, NewBlk);
712}
713
// Final cleanup on the (now fully reduced) entry block: drop CONTINUE
// instructions that sit immediately before an ENDLOOP, since falling through
// to ENDLOOP already continues the loop.
void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
  assert((!MBB->getParent()->getJumpTableInfo()
          || MBB->getParent()->getJumpTableInfo()->isEmpty())
         && "found a jump table");

   //collect continue right before endloop
   // Pre trails It by one instruction, so each iteration inspects the
   // consecutive pair (Pre, It).  On the first iteration Pre == It, so the
   // pair test cannot match.
   SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr;
   MachineBasicBlock::iterator Pre = MBB->begin();
   MachineBasicBlock::iterator E = MBB->end();
   MachineBasicBlock::iterator It = Pre;
   while (It != E) {
     if (Pre->getOpcode() == AMDGPU::CONTINUE
         && It->getOpcode() == AMDGPU::ENDLOOP)
       ContInstr.push_back(Pre);
     Pre = It;
     ++It;
   }

   //delete continue right before endloop
   // Deletion is deferred to here so the iterators above stay valid.
   for (unsigned i = 0; i < ContInstr.size(); ++i)
      ContInstr[i]->eraseFromParent();

   // TODO to fix up jump table so later phase won't be confused.  if
   // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
   // there isn't such an interface yet.  alternatively, replace all the other
   // blocks in the jump table with the entryBlk //}

}
742
743
// Normalize the CFG before structurization: order the blocks, give every
// infinite loop a dummy exit, strip unconditional and duplicate branches,
// and funnel multiple return blocks into a single dummy exit block.
// Returns true when the CFG was changed.
bool AMDGPUCFGStructurizer::prepare() {
  bool Changed = false;

  //FIXME: if not reducible flow graph, make it so ???

  DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);

  orderBlocks(FuncRep);

  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;

  // Add an ExitBlk to loop that don't have one
  for (MachineLoopInfo::iterator It = MLI->begin(),
       E = MLI->end(); It != E; ++It) {
    MachineLoop *LoopRep = (*It);
    MBBVector ExitingMBBs;
    LoopRep->getExitingBlocks(ExitingMBBs);

    // No exiting block means an infinite loop; fabricate a dummy exit so the
    // loop can be structurized, and treat it like a return block below.
    if (ExitingMBBs.size() == 0) {
      MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
      if (DummyExitBlk)
        RetBlks.push_back(DummyExitBlk);
    }
  }

  // Remove unconditional branch instr.
  // Add dummy exit block iff there are multiple returns.
  for (SmallVectorImpl<MachineBasicBlock *>::const_iterator
       It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) {
    MachineBasicBlock *MBB = *It;
    removeUnconditionalBranch(MBB);
    removeRedundantConditionalBranch(MBB);
    if (isReturnBlock(MBB)) {
      RetBlks.push_back(MBB);
    }
    // After branch cleanup every block must have at most two successors.
    assert(MBB->succ_size() <= 2);
  }

  // The pattern matcher needs a unique exit, so merge multiple returns.
  if (RetBlks.size() >= 2) {
    addDummyExitBlock(RetBlks);
    Changed = true;
  }

  return Changed;
}
789
// Main structurization driver.  Iterates pattern matching over the blocks in
// reversed DFS post order (grouped by SCC) until the function collapses to a
// single block, then wraps up and frees the per-block bookkeeping.  Aborts
// via llvm_unreachable when no progress can be made (irreducible CFG).
bool AMDGPUCFGStructurizer::run() {

  //Assume reducible CFG...
  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");

#ifdef STRESSTEST
  //Use the worse block ordering to test the algorithm.
  ReverseVector(orderedBlks);
#endif

  DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
  int NumIter = 0;
  bool Finish = false;
  MachineBasicBlock *MBB;
  bool MakeProgress = false;
  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
                                        OrderedBlks.end());

  // Fixed-point loop: keep sweeping the function while each sweep retires at
  // least one block.
  do {
    ++NumIter;
    DEBUG(
      dbgs() << "numIter = " << NumIter
             << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
    );

    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
        OrderedBlks.begin();
    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
        OrderedBlks.end();

    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
        It;
    MachineBasicBlock *SccBeginMBB = NULL;
    int SccNumBlk = 0;  // The number of active blocks, init to a
                        // maximum possible number.
    int SccNumIter;     // Number of iteration in this SCC.

    while (It != E) {
      MBB = *It;

      // Entering a new SCC: remember where it starts so it can be replayed.
      if (!SccBeginMBB) {
        SccBeginIter = It;
        SccBeginMBB = MBB;
        SccNumIter = 0;
        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
        DEBUG(
              dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
              dbgs() << "\n";
        );
      }

      if (!isRetiredBlock(MBB))
        patternMatch(MBB);

      ++It;

      bool ContNextScc = true;
      // At an SCC boundary, decide whether to replay this SCC or move on.
      if (It == E
          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
        // Just finish one scc.
        ++SccNumIter;
        int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
          DEBUG(
            dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
                   << ", sccNumIter = " << SccNumIter;
            dbgs() << "doesn't make any progress\n";
          );
          ContNextScc = true;
        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
          // The SCC shrank but is not down to one block yet: replay it.
          SccNumBlk = sccRemainedNumBlk;
          It = SccBeginIter;
          ContNextScc = false;
          DEBUG(
            dbgs() << "repeat processing SCC" << getSCCNum(MBB)
                   << "sccNumIter = " << SccNumIter << '\n';
          );
        } else {
          // Finish the current scc.
          ContNextScc = true;
        }
      } else {
        // Continue on next component in the current scc.
        ContNextScc = false;
      }

      if (ContNextScc)
        SccBeginMBB = NULL;
    } //while, "one iteration" over the function.

    // Done when the entry block has no successors left (single-block CFG).
    MachineBasicBlock *EntryMBB =
        GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
    if (EntryMBB->succ_size() == 0) {
      Finish = true;
      DEBUG(
        dbgs() << "Reduce to one block\n";
      );
    } else {
      int NewnumRemainedBlk
        = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
      // consider cloned blocks ??
      if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
        MakeProgress = true;
        NumRemainedBlk = NewnumRemainedBlk;
      } else {
        MakeProgress = false;
        DEBUG(
          dbgs() << "No progress\n";
        );
      }
    }
  } while (!Finish && MakeProgress);

  // Misc wrap up to maintain the consistency of the Function representation.
  wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep));

  // Detach retired Block, release memory.
  for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
      It != E; ++It) {
    if ((*It).second && (*It).second->IsRetired) {
      assert(((*It).first)->getNumber() != -1);
      DEBUG(
        dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
      );
      (*It).first->eraseFromParent();  //Remove from the parent Function.
    }
    delete (*It).second;
  }
  BlockInfoMap.clear();
  LLInfoMap.clear();

  if (!Finish) {
    DEBUG(FuncRep->viewCFG());
    llvm_unreachable("IRREDUCIBLE_CFG");
  }

  return true;
}
928
929
930
931void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
932  int SccNum = 0;
933  MachineBasicBlock *MBB;
934  for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
935       ++It, ++SccNum) {
936    std::vector<MachineBasicBlock *> &SccNext = *It;
937    for (std::vector<MachineBasicBlock *>::const_iterator
938         blockIter = SccNext.begin(), blockEnd = SccNext.end();
939         blockIter != blockEnd; ++blockIter) {
940      MBB = *blockIter;
941      OrderedBlks.push_back(MBB);
942      recordSccnum(MBB, SccNum);
943    }
944  }
945
946  //walk through all the block in func to check for unreachable
947  typedef GraphTraits<MachineFunction *> GTM;
948  MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
949  for (; It != E; ++It) {
950    MachineBasicBlock *MBB = &(*It);
951    SccNum = getSCCNum(MBB);
952    if (SccNum == INVALIDSCCNUM)
953      dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
954  }
955}
956
957int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
958  int NumMatch = 0;
959  int CurMatch;
960
961  DEBUG(
962        dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
963  );
964
965  while ((CurMatch = patternMatchGroup(MBB)) > 0)
966    NumMatch += CurMatch;
967
968  DEBUG(
969        dbgs() << "End patternMatch BB" << MBB->getNumber()
970      << ", numMatch = " << NumMatch << "\n";
971  );
972
973  return NumMatch;
974}
975
976int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
977  int NumMatch = 0;
978  NumMatch += loopendPatternMatch();
979  NumMatch += serialPatternMatch(MBB);
980  NumMatch += ifPatternMatch(MBB);
981  return NumMatch;
982}
983
984
985int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
986  if (MBB->succ_size() != 1)
987    return 0;
988
989  MachineBasicBlock *childBlk = *MBB->succ_begin();
990  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
991    return 0;
992
993  mergeSerialBlock(MBB, childBlk);
994  ++numSerialPatternMatch;
995  return 1;
996}
997
/// Match and fold an if/if-else region rooted at MBB (a block ending in a
/// conditional branch with two successors and no back edge).  Both arms are
/// first reduced recursively, then the shape is classified (diamond,
/// triangle, or detached loop break/continue); unrecognized shapes fall back
/// to handleJumpintoIf.  Returns the total number of patterns matched plus
/// blocks cloned.
int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
  //two edges
  if (MBB->succ_size() != 2)
    return 0;
  if (hasBackEdge(MBB))
    return 0;
  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
  if (!BranchMI)
    return 0;

  assert(isCondBranch(BranchMI));
  int NumMatch = 0;

  // Reduce both arms as far as possible first, so the shape checks below see
  // the simplest possible region.
  MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
  NumMatch += serialPatternMatch(TrueMBB);
  NumMatch += ifPatternMatch(TrueMBB);
  MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
  NumMatch += serialPatternMatch(FalseMBB);
  NumMatch += ifPatternMatch(FalseMBB);
  MachineBasicBlock *LandBlk;
  int Cloned = 0;

  assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
  // TODO: Simplify
  if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
    && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
    // Diamond pattern
    LandBlk = *TrueMBB->succ_begin();
  } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
    // Triangle pattern, false is empty
    LandBlk = FalseMBB;
    FalseMBB = NULL;
  } else if (FalseMBB->succ_size() == 1
             && *FalseMBB->succ_begin() == TrueMBB) {
    // Triangle pattern, true is empty
    // We reverse the predicate to make a triangle, empty false pattern;
    std::swap(TrueMBB, FalseMBB);
    reversePredicateSetter(MBB->end());
    LandBlk = FalseMBB;
    FalseMBB = NULL;
  } else if (FalseMBB->succ_size() == 1
             && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
    // One arm is a detached break/continue of the same loop; land on the
    // other arm's sole successor.
    LandBlk = *FalseMBB->succ_begin();
  } else if (TrueMBB->succ_size() == 1
    && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
    LandBlk = *TrueMBB->succ_begin();
  } else {
    // No recognizable shape; try the jump-into-if transformation instead.
    return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
  }

  // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
  // new BB created for landBlk==NULL may introduce new challenge to the
  // reduction process.
  if (LandBlk &&
      ((TrueMBB && TrueMBB->pred_size() > 1)
      || (FalseMBB && FalseMBB->pred_size() > 1))) {
     Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
  }

  // Arms with side entries must be cloned so MBB becomes their only
  // predecessor before the merge.
  if (TrueMBB && TrueMBB->pred_size() > 1) {
    TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
    ++Cloned;
  }

  if (FalseMBB && FalseMBB->pred_size() > 1) {
    FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
    ++Cloned;
  }

  mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);

  ++numIfPatternMatch;

  numClonedBlock += Cloned;

  return 1 + Cloned + NumMatch;
}
1075
1076int AMDGPUCFGStructurizer::loopendPatternMatch() {
1077  std::vector<MachineLoop *> NestedLoops;
1078  for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end();
1079      It != E; ++It) {
1080    df_iterator<MachineLoop *> LpIt = df_begin(*It),
1081        LpE = df_end(*It);
1082    for (; LpIt != LpE; ++LpIt)
1083      NestedLoops.push_back(*LpIt);
1084  }
1085  if (NestedLoops.size() == 0)
1086    return 0;
1087
1088  // Process nested loop outside->inside, so "continue" to a outside loop won't
1089  // be mistaken as "break" of the current loop.
1090  int Num = 0;
1091  for (std::vector<MachineLoop *>::reverse_iterator It = NestedLoops.rbegin(),
1092      E = NestedLoops.rend(); It != E; ++It) {
1093    MachineLoop *ExaminedLoop = *It;
1094    if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
1095      continue;
1096    DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
1097    int NumBreak = mergeLoop(ExaminedLoop);
1098    if (NumBreak == -1)
1099      break;
1100    Num += NumBreak;
1101  }
1102  return Num;
1103}
1104
/// Structurize one natural loop: rewrite each exit edge as a structured
/// BREAK and each back edge as a CONTINUE, collapse the remaining body into
/// the header, and wrap the header with WHILELOOP/ENDLOOP.  Always returns
/// 1 (one pattern matched).
int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
  MBBVector ExitingMBBs;
  LoopRep->getExitingBlocks(ExitingMBBs);
  assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
  DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
  // We assume a single ExitBlk
  MBBVector ExitBlks;
  LoopRep->getExitBlocks(ExitBlks);
  SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
  for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
    ExitBlkSet.insert(ExitBlks[i]);
  assert(ExitBlkSet.size() == 1);
  MachineBasicBlock *ExitBlk = *ExitBlks.begin();
  assert(ExitBlk && "Loop has several exit block");
  // Latch blocks: the in-loop predecessors of the header, i.e. the sources
  // of the loop's back edges.
  MBBVector LatchBlks;
  typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits;
  InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader),
      PE = InvMBBTraits::child_end(LoopHeader);
  for (; PI != PE; PI++) {
    if (LoopRep->contains(*PI))
      LatchBlks.push_back(*PI);
  }

  // Turn exit edges into BREAKs and back edges into CONTINUEs.
  for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
    mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
  for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
    settleLoopcontBlock(LatchBlks[i], LoopHeader);
  // Collapse what remains of the loop body into the header.
  int Match = 0;
  do {
    Match = 0;
    Match += serialPatternMatch(LoopHeader);
    Match += ifPatternMatch(LoopHeader);
  } while (Match > 0);
  mergeLooplandBlock(LoopHeader, ExitBlk);
  // The header now stands for the entire loop: re-parent it, or drop it from
  // loop info when this was a top-level loop.
  MachineLoop *ParentLoop = LoopRep->getParentLoop();
  if (ParentLoop)
    MLI->changeLoopFor(LoopHeader, ParentLoop);
  else
    MLI->removeBlock(LoopHeader);
  Visited[LoopRep] = true;
  return 1;
}
1148
1149int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep,
1150    MachineBasicBlock *LoopHeader) {
1151  int NumCont = 0;
1152  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB;
1153  typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM;
1154  GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader),
1155      E = GTIM::child_end(LoopHeader);
1156  for (; It != E; ++It) {
1157    MachineBasicBlock *MBB = *It;
1158    if (LoopRep->contains(MBB)) {
1159      handleLoopcontBlock(MBB, MLI->getLoopFor(MBB),
1160                          LoopHeader, LoopRep);
1161      ContMBB.push_back(MBB);
1162      ++NumCont;
1163    }
1164  }
1165
1166  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(),
1167      E = ContMBB.end(); It != E; ++It) {
1168    (*It)->removeSuccessor(LoopHeader);
1169  }
1170
1171  numLoopcontPatternMatch += NumCont;
1172
1173  return NumCont;
1174}
1175
1176
1177bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
1178    MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
1179  if (Src1MBB->succ_size() == 0) {
1180    MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
1181    if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
1182      MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
1183      if (TheEntry) {
1184        DEBUG(
1185          dbgs() << "isLoopContBreakBlock yes src1 = BB"
1186                 << Src1MBB->getNumber()
1187                 << " src2 = BB" << Src2MBB->getNumber() << "\n";
1188        );
1189        return true;
1190      }
1191    }
1192  }
1193  return false;
1194}
1195
1196int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
1197    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
1198  int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
1199  if (Num == 0) {
1200    DEBUG(
1201      dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
1202    );
1203    Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
1204  }
1205  return Num;
1206}
1207
/// Worker for handleJumpintoIf: starting from TrueMBB, walk the chain of
/// single-successor blocks looking for a block that FalseMBB reaches via a
/// single path.  When found, clone the side-entry blocks on both arms down
/// to that join block and re-run the pattern matchers on HeadMBB.  Returns
/// the number of matches/clones performed (0 when no join block is found).
int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
  int Num = 0;
  MachineBasicBlock *DownBlk;

  //trueBlk could be the common post dominator
  DownBlk = TrueMBB;

  DEBUG(
    dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
           << " true = BB" << TrueMBB->getNumber()
           << ", numSucc=" << TrueMBB->succ_size()
           << " false = BB" << FalseMBB->getNumber() << "\n";
  );

  while (DownBlk) {
    DEBUG(
      dbgs() << "check down = BB" << DownBlk->getNumber();
    );

    if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
      DEBUG(
        dbgs() << " working\n";
      );

      // Found the join point: privatize both arms' paths down to DownBlk by
      // cloning any side-entry blocks, then retry the structural matchers.
      Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
      Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);

      numClonedBlock += Num;
      Num += serialPatternMatch(*HeadMBB->succ_begin());
      Num += serialPatternMatch(*std::next(HeadMBB->succ_begin()));
      Num += ifPatternMatch(HeadMBB);
      assert(Num > 0);

      break;
    }
    DEBUG(
      dbgs() << " not working\n";
    );
    // Keep walking down the single-successor chain; stop at a fork or end.
    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : NULL;
  } // walk down the postDomTree

  return Num;
}
1252
1253void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
1254    MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
1255    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
1256  dbgs() << "head = BB" << HeadMBB->getNumber()
1257         << " size = " << HeadMBB->size();
1258  if (Detail) {
1259    dbgs() << "\n";
1260    HeadMBB->print(dbgs());
1261    dbgs() << "\n";
1262  }
1263
1264  if (TrueMBB) {
1265    dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = "
1266           << TrueMBB->size() << " numPred = " << TrueMBB->pred_size();
1267    if (Detail) {
1268      dbgs() << "\n";
1269      TrueMBB->print(dbgs());
1270      dbgs() << "\n";
1271    }
1272  }
1273  if (FalseMBB) {
1274    dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = "
1275           << FalseMBB->size() << " numPred = " << FalseMBB->pred_size();
1276    if (Detail) {
1277      dbgs() << "\n";
1278      FalseMBB->print(dbgs());
1279      dbgs() << "\n";
1280    }
1281  }
1282  if (LandMBB) {
1283    dbgs() << ", land = BB" << LandMBB->getNumber() << " size = "
1284           << LandMBB->size() << " numPred = " << LandMBB->pred_size();
1285    if (Detail) {
1286      dbgs() << "\n";
1287      LandMBB->print(dbgs());
1288      dbgs() << "\n";
1289    }
1290  }
1291
1292    dbgs() << "\n";
1293}
1294
/// Attempt to rewrite a "branch into if" region so that blocks with side
/// entries (TrueMBB/FalseMBB reachable from outside HeadMBB) are migrated
/// into LandBlk, guarded by an initReg-driven inner if/else.  Returns the
/// number of new blocks created.
/// NOTE(review): several paths below call llvm_unreachable, so the code that
/// follows those calls is currently dead; it documents the intended full
/// implementation (extra registers/compares that were never wired up after
/// this pass was moved to run post-RA).
int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
    MachineBasicBlock **LandMBBPtr) {
  bool MigrateTrue = false;
  bool MigrateFalse = false;

  MachineBasicBlock *LandBlk = *LandMBBPtr;

  assert((!TrueMBB || TrueMBB->succ_size() <= 1)
         && (!FalseMBB || FalseMBB->succ_size() <= 1));

  if (TrueMBB == FalseMBB)
    return 0;

  MigrateTrue = needMigrateBlock(TrueMBB);
  MigrateFalse = needMigrateBlock(FalseMBB);

  if (!MigrateTrue && !MigrateFalse)
    return 0;

  // If we need to migrate either trueBlk and falseBlk, migrate the rest that
  // have more than one predecessors.  without doing this, its predecessor
  // rather than headBlk will have undefined value in initReg.
  if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
    MigrateTrue = true;
  if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
    MigrateFalse = true;

  DEBUG(
    dbgs() << "before improveSimpleJumpintoIf: ";
    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
  );

  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
  //
  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
  //      {initReg = 0; org falseBlk branch }
  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
  //      => org landBlk
  //      if landBlk->pred_size() > 2, put the about if-else inside
  //      if (initReg !=2) {...}
  //
  // add initReg = initVal to headBlk

  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
  if (!MigrateTrue || !MigrateFalse) {
    // XXX: We have an opportunity here to optimize the "branch into if" case
    // here.  Branch into if looks like this:
    //                        entry
    //                       /     |
    //           diamond_head       branch_from
    //             /      \           |
    // diamond_false        diamond_true
    //             \      /
    //               done
    //
    // The diamond_head block begins the "if" and the diamond_true block
    // is the block being "branched into".
    //
    // If MigrateTrue is true, then TrueBB is the block being "branched into"
    // and if MigrateFalse is true, then FalseBB is the block being
    // "branched into"
    //
    // Here is the pseudo code for how I think the optimization should work:
    // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
    // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
    // 3. Move the branch instruction from diamond_head into its own basic
    //    block (new_block).
    // 4. Add an unconditional branch from diamond_head to new_block
    // 5. Replace the branch instruction in branch_from with an unconditional
    //    branch to new_block.  If branch_from has multiple predecessors, then
    //    we need to replace the True/False block in the branch
    //    instruction instead of replacing it.
    // 6. Change the condition of the branch instruction in new_block from
    //    COND to (COND || GPR0)
    //
    // In order insert these MOV instruction, we will need to use the
    // RegisterScavenger.  Usually liveness stops being tracked during
    // the late machine optimization passes, however if we implement
    // bool TargetRegisterInfo::requiresRegisterScavenging(
    //                                                const MachineFunction &MF)
    // and have it return true, liveness will be tracked correctly
    // by generic optimization passes.  We will also need to make sure that
    // all of our target-specific passes that run after regalloc and before
    // the CFGStructurizer track liveness and we will need to modify this pass
    // to correctly track liveness.
    //
    // After the above changes, the new CFG should look like this:
    //                        entry
    //                       /     |
    //           diamond_head       branch_from
    //                       \     /
    //                      new_block
    //                      /      |
    //         diamond_false        diamond_true
    //                      \      /
    //                        done
    //
    // Without this optimization, we are forced to duplicate the diamond_true
    // block and we will end up with a CFG like this:
    //
    //                        entry
    //                       /     |
    //           diamond_head       branch_from
    //             /      \                   |
    // diamond_false        diamond_true      diamond_true (duplicate)
    //             \      /                   |
    //               done --------------------|
    //
    // Duplicating diamond_true can be very costly especially if it has a
    // lot of instructions.
    return 0;
  }

  int NumNewBlk = 0;

  bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);

  //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
  MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);

  if (LandBlkHasOtherPred) {
    // Dead code below the unreachable: the intended guard "if (initReg != 2)"
    // around the inner if/else was never implemented.
    llvm_unreachable("Extra register needed to handle CFG");
    unsigned CmpResReg =
      HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
    llvm_unreachable("Extra compare instruction needed to handle CFG");
    insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
        CmpResReg, DebugLoc());
  }

  // XXX: We are running this after RA, so creating virtual registers will
  // cause an assertion failure in the PostRA scheduling pass.
  unsigned InitReg =
    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
  insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
      DebugLoc());

  if (MigrateTrue) {
    migrateInstruction(TrueMBB, LandBlk, I);
    // need to uncondionally insert the assignment to ensure a path from its
    // predecessor rather than headBlk has valid value in initReg if
    // (initVal != 1).
    llvm_unreachable("Extra register needed to handle CFG");
  }
  insertInstrBefore(I, AMDGPU::ELSE);

  if (MigrateFalse) {
    migrateInstruction(FalseMBB, LandBlk, I);
    // need to uncondionally insert the assignment to ensure a path from its
    // predecessor rather than headBlk has valid value in initReg if
    // (initVal != 0)
    llvm_unreachable("Extra register needed to handle CFG");
  }

  if (LandBlkHasOtherPred) {
    // add endif
    insertInstrBefore(I, AMDGPU::ENDIF);

    // put initReg = 2 to other predecessors of landBlk
    for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
         PE = LandBlk->pred_end(); PI != PE; ++PI) {
      MachineBasicBlock *MBB = *PI;
      if (MBB != TrueMBB && MBB != FalseMBB)
        llvm_unreachable("Extra register needed to handle CFG");
    }
  }
  DEBUG(
    dbgs() << "result from improveSimpleJumpintoIf: ";
    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
  );

  // update landBlk
  *LandMBBPtr = LandBlk;

  return NumNewBlk;
}
1471
/// Rewrite one loop-continue edge: ContingMBB branches back to the loop
/// header ContMBB; the actual rewrite is delegated to settleLoopcontBlock.
/// ContingLoop/ContLoop are used only for the debug output here.
void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB,
    MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
    MachineLoop *ContLoop) {
  DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber()
               << " header = BB" << ContMBB->getNumber() << "\n";
        dbgs() << "Trying to continue loop-depth = "
               << getLoopDepth(ContLoop)
               << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";);
  settleLoopcontBlock(ContingMBB, ContMBB);
}
1482
/// Merge SrcMBB (the sole successor of DstMBB, with DstMBB as its sole
/// predecessor) into DstMBB, then retire SrcMBB.
void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
    MachineBasicBlock *SrcMBB) {
  DEBUG(
    dbgs() << "serialPattern BB" << DstMBB->getNumber()
           << " <= BB" << SrcMBB->getNumber() << "\n";
  );
  // Move every instruction of SrcMBB to the end of DstMBB.
  DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());

  // DstMBB drops its edge to SrcMBB and inherits SrcMBB's successors.
  DstMBB->removeSuccessor(SrcMBB);
  cloneSuccessorList(DstMBB, SrcMBB);

  // SrcMBB is now empty: detach its edges, drop it from loop info, and mark
  // it retired so the wrap-up phase erases it from the function.
  removeSuccessor(SrcMBB);
  MLI->removeBlock(SrcMBB);
  retireBlock(SrcMBB);
}
1498
/// Fold a matched if-then-else into MBB: replace the conditional branch with
/// a structured IF (nonzero-branch opcode), splice the true arm in, emit
/// ELSE and splice the false arm, terminate with ENDIF, and retire the
/// merged arm blocks.  LandMBB (may be NULL) becomes MBB's successor when
/// both arms were present.
void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
    MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
  assert (TrueMBB);
  DEBUG(
    dbgs() << "ifPattern BB" << MBB->getNumber();
    dbgs() << "{  ";
    if (TrueMBB) {
      dbgs() << "BB" << TrueMBB->getNumber();
    }
    dbgs() << "  } else ";
    dbgs() << "{  ";
    if (FalseMBB) {
      dbgs() << "BB" << FalseMBB->getNumber();
    }
    dbgs() << "  }\n ";
    dbgs() << "landBlock: ";
    if (!LandMBB) {
      dbgs() << "NULL";
    } else {
      dbgs() << "BB" << LandMBB->getNumber();
    }
    dbgs() << "\n";
  );

  // Capture opcode and location before the branch is replaced/erased below.
  int OldOpcode = BranchMI->getOpcode();
  DebugLoc BranchDL = BranchMI->getDebugLoc();

//    transform to
//    if cond
//       trueBlk
//    else
//       falseBlk
//    endif
//    landBlk

  // Insert the structured IF just before the old branch; the arms are then
  // spliced in at this same position (before I).
  MachineBasicBlock::iterator I = BranchMI;
  insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
      BranchDL);

  if (TrueMBB) {
    // Absorb the true arm and detach its edges before retiring it.
    MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
    MBB->removeSuccessor(TrueMBB);
    if (LandMBB && TrueMBB->succ_size()!=0)
      TrueMBB->removeSuccessor(LandMBB);
    retireBlock(TrueMBB);
    MLI->removeBlock(TrueMBB);
  }

  if (FalseMBB) {
    // The ELSE separates the two spliced arms.
    insertInstrBefore(I, AMDGPU::ELSE);
    MBB->splice(I, FalseMBB, FalseMBB->begin(),
                   FalseMBB->end());
    MBB->removeSuccessor(FalseMBB);
    if (LandMBB && FalseMBB->succ_size() != 0)
      FalseMBB->removeSuccessor(LandMBB);
    retireBlock(FalseMBB);
    MLI->removeBlock(FalseMBB);
  }
  insertInstrBefore(I, AMDGPU::ENDIF);

  // The original conditional branch is now redundant.
  BranchMI->eraseFromParent();

  if (LandMBB && TrueMBB && FalseMBB)
    MBB->addSuccessor(LandMBB);

}
1566
/// Wrap the collapsed loop body DstBlk in WHILELOOP ... ENDLOOP and redirect
/// control flow to the loop's landing block LandMBB.
void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
    MachineBasicBlock *LandMBB) {
  DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
               << " land = BB" << LandMBB->getNumber() << "\n";);

  // Bracket the body with the structured loop ops: WHILELOOP at the top,
  // ENDLOOP at the bottom.
  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
  DstBlk->addSuccessor(LandMBB);
  // Drop the self back edge — repetition is now encoded by the ENDLOOP.
  DstBlk->removeSuccessor(DstBlk);
}
1577
1578
/// Rewrite the exit edge of ExitingMBB into a structured break: the
/// conditional branch becomes IF(pred) BREAK ENDIF, and the edge to LandMBB
/// is removed.
void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
    MachineBasicBlock *LandMBB) {
  DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
               << " land = BB" << LandMBB->getNumber() << "\n";);
  MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
  assert(BranchMI && isCondBranch(BranchMI));
  DebugLoc DL = BranchMI->getDebugLoc();
  MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
  MachineBasicBlock::iterator I = BranchMI;
  // If the taken target is not the land block, flip the predicate so the IF
  // below guards the break path.
  if (TrueBranch != LandMBB)
    reversePredicateSetter(I);
  insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
  insertInstrBefore(I, AMDGPU::BREAK);
  insertInstrBefore(I, AMDGPU::ENDIF);
  //now branchInst can be erase safely
  BranchMI->eraseFromParent();
  //now take care of successors, retire blocks
  ExitingMBB->removeSuccessor(LandMBB);
}
1598
/// Replace the back-edge branch of ContingMBB (to loop header ContMBB) with
/// structured continue ops.  When the branch is the last instruction of the
/// block, a single CONTINUE_LOGICAL* conditional suffices; otherwise an
/// IF / CONTINUE / ENDIF sequence is emitted.  If the branch was already
/// erased by an earlier rewrite, an unconditional CONTINUE is appended.
void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
    MachineBasicBlock *ContMBB) {
  DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
               << ContingMBB->getNumber()
               << ", cont = BB" << ContMBB->getNumber() << "\n";);

  MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
  if (MI) {
    assert(isCondBranch(MI));
    MachineBasicBlock::iterator I = MI;
    MachineBasicBlock *TrueBranch = getTrueBranch(MI);
    int OldOpcode = MI->getOpcode();
    DebugLoc DL = MI->getDebugLoc();

    // The compact continue form is only usable when the branch terminates
    // the block.
    bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);

    if (UseContinueLogical == false) {
      // Pick the branch polarity so the IF guards the continue path.
      int BranchOpcode =
          TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
          getBranchZeroOpcode(OldOpcode);
      insertCondBranchBefore(I, BranchOpcode, DL);
      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
      insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
      insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
    } else {
      int BranchOpcode =
          TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
          getContinueZeroOpcode(OldOpcode);
      insertCondBranchBefore(I, BranchOpcode, DL);
    }

    MI->eraseFromParent();
  } else {
    // if we've arrived here then we've already erased the branch instruction
    // travel back up the basic block to see the last reference of our debug
    // location we've just inserted that reference here so it should be
    // representative insertEnd to ensure phi-moves, if exist, go before the
    // continue-instr.
    insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
        getLastDebugLocInBB(ContingMBB));
  }
}
1641
1642int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
1643    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
1644  int Cloned = 0;
1645  assert(PreMBB->isSuccessor(SrcMBB));
1646  while (SrcMBB && SrcMBB != DstMBB) {
1647    assert(SrcMBB->succ_size() == 1);
1648    if (SrcMBB->pred_size() > 1) {
1649      SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
1650      ++Cloned;
1651    }
1652
1653    PreMBB = SrcMBB;
1654    SrcMBB = *SrcMBB->succ_begin();
1655  }
1656
1657  return Cloned;
1658}
1659
/// Clone MBB for predecessor PredMBB: PredMBB's branch and successor edge
/// are redirected to the clone, which inherits all of MBB's successors.
/// Returns the new clone.
MachineBasicBlock *
AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
    MachineBasicBlock *PredMBB) {
  assert(PredMBB->isSuccessor(MBB) &&
         "succBlk is not a prececessor of curBlk");

  MachineBasicBlock *CloneMBB = clone(MBB);  //clone instructions
  // Retarget PredMBB's branch instruction(s) from MBB to the clone.
  replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
  //srcBlk, oldBlk, newBlk

  // Rewire the CFG edge to match the retargeted branch.
  PredMBB->removeSuccessor(MBB);
  PredMBB->addSuccessor(CloneMBB);

  // add all successor to cloneBlk
  cloneSuccessorList(CloneMBB, MBB);

  numClonedInstr += MBB->size();

  DEBUG(
    dbgs() << "Cloned block: " << "BB"
           << MBB->getNumber() << "size " << MBB->size() << "\n";
  );

  SHOWNEWBLK(CloneMBB, "result of Cloned block: ");

  return CloneMBB;
}
1687
/// Move SrcMBB's instructions into DstMBB just before position I.  If SrcMBB
/// ends in a (non-AMDGPU) branch, that branch is left behind in SrcMBB; only
/// the instructions preceding it are moved.
void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
    MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
  MachineBasicBlock::iterator SpliceEnd;
  //look for the input branchinstr, not the AMDGPU branchinstr
  MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
  if (!BranchMI) {
    DEBUG(
      dbgs() << "migrateInstruction don't see branch instr\n" ;
    );
    // No branch: move everything.
    SpliceEnd = SrcMBB->end();
  } else {
    DEBUG(
      dbgs() << "migrateInstruction see branch instr\n" ;
      BranchMI->dump();
    );
    // Stop the splice at the branch so it stays in SrcMBB.
    SpliceEnd = BranchMI;
  }
  DEBUG(
    dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
      << "srcSize = " << SrcMBB->size() << "\n";
  );

  //splice insert before insertPos
  DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);

  DEBUG(
    dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
      << "srcSize = " << SrcMBB->size() << "\n";
  );
}
1718
/// Give an infinite loop (latch ends in an unconditional branch back to the
/// header) a synthetic exit: create a dummy exit block and replace the
/// unconditional back edge with a conditional BRANCH_COND_i32.  Returns the
/// dummy exit block, or NULL when the loop is not of this shape.
/// NOTE(review): the llvm_unreachable below fires before the rewrite runs —
/// the condition register setup was never implemented, so the tail of this
/// function is currently dead code documenting the intended transformation.
MachineBasicBlock *
AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
  MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);

  if (!LoopHeader || !LoopLatch)
    return NULL;
  MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
  // Is LoopRep an infinite loop ?
  if (!BranchMI || !isUncondBranch(BranchMI))
    return NULL;

  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
  FuncRep->push_back(DummyExitBlk);  //insert to function
  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
  DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
  MachineBasicBlock::iterator I = BranchMI;
  unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC);
  llvm_unreachable("Extra register needed to handle CFG");
  MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32);
  MachineInstrBuilder MIB(*FuncRep, NewMI);
  MIB.addMBB(LoopHeader);
  MIB.addReg(ImmReg, false);
  SHOWNEWINSTR(NewMI);
  BranchMI->eraseFromParent();
  LoopLatch->addSuccessor(DummyExitBlk);

  return DummyExitBlk;
}
1749
1750void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
1751  MachineInstr *BranchMI;
1752
1753  // I saw two unconditional branch in one basic block in example
1754  // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
1755  while ((BranchMI = getLoopendBlockBranchInstr(MBB))
1756          && isUncondBranch(BranchMI)) {
1757    DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump(););
1758    BranchMI->eraseFromParent();
1759  }
1760}
1761
1762void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
1763    MachineBasicBlock *MBB) {
1764  if (MBB->succ_size() != 2)
1765    return;
1766  MachineBasicBlock *MBB1 = *MBB->succ_begin();
1767  MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin());
1768  if (MBB1 != MBB2)
1769    return;
1770
1771  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
1772  assert(BranchMI && isCondBranch(BranchMI));
1773  DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump(););
1774  BranchMI->eraseFromParent();
1775  SHOWNEWBLK(MBB1, "Removing redundant successor");
1776  MBB->removeSuccessor(MBB1);
1777}
1778
1779void AMDGPUCFGStructurizer::addDummyExitBlock(
1780    SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
1781  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
1782  FuncRep->push_back(DummyExitBlk);  //insert to function
1783  insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
1784
1785  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
1786       E = RetMBB.end(); It != E; ++It) {
1787    MachineBasicBlock *MBB = *It;
1788    MachineInstr *MI = getReturnInstr(MBB);
1789    if (MI)
1790      MI->eraseFromParent();
1791    MBB->addSuccessor(DummyExitBlk);
1792    DEBUG(
1793      dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
1794             << " successors\n";
1795    );
1796  }
1797  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
1798}
1799
1800void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
1801  while (MBB->succ_size())
1802    MBB->removeSuccessor(*MBB->succ_begin());
1803}
1804
1805void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
1806    int SccNum) {
1807  BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
1808  if (!srcBlkInfo)
1809    srcBlkInfo = new BlockInformation();
1810  srcBlkInfo->SccNum = SccNum;
1811}
1812
1813void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
1814  DEBUG(
1815        dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
1816  );
1817
1818  BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
1819
1820  if (!SrcBlkInfo)
1821    SrcBlkInfo = new BlockInformation();
1822
1823  SrcBlkInfo->IsRetired = true;
1824  assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
1825         && "can't retire block yet");
1826}
1827
1828void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep,
1829    MachineBasicBlock *MBB) {
1830  MachineBasicBlock *&TheEntry = LLInfoMap[loopRep];
1831  if (!MBB) {
1832    MBB = FuncRep->CreateMachineBasicBlock();
1833    FuncRep->push_back(MBB);  //insert to function
1834    SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: ");
1835  }
1836  TheEntry = MBB;
1837  DEBUG(
1838    dbgs() << "setLoopLandBlock loop-header = BB"
1839           << loopRep->getHeader()->getNumber()
1840           << "  landing-block = BB" << MBB->getNumber() << "\n";
1841  );
1842}
1843
1844MachineBasicBlock *
1845AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
1846    MachineBasicBlock *MBB2) {
1847
1848  if (PDT->dominates(MBB1, MBB2))
1849    return MBB1;
1850  if (PDT->dominates(MBB2, MBB1))
1851    return MBB2;
1852
1853  MachineDomTreeNode *Node1 = PDT->getNode(MBB1);
1854  MachineDomTreeNode *Node2 = PDT->getNode(MBB2);
1855
1856  // Handle newly cloned node.
1857  if (!Node1 && MBB1->succ_size() == 1)
1858    return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2);
1859  if (!Node2 && MBB2->succ_size() == 1)
1860    return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
1861
1862  if (!Node1 || !Node2)
1863    return NULL;
1864
1865  Node1 = Node1->getIDom();
1866  while (Node1) {
1867    if (PDT->dominates(Node1, Node2))
1868      return Node1->getBlock();
1869    Node1 = Node1->getIDom();
1870  }
1871
1872  return NULL;
1873}
1874
1875MachineBasicBlock *
1876AMDGPUCFGStructurizer::findNearestCommonPostDom(
1877    std::set<MachineBasicBlock *> &MBBs) {
1878  MachineBasicBlock *CommonDom;
1879  std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin();
1880  std::set<MachineBasicBlock *>::const_iterator E = MBBs.end();
1881  for (CommonDom = *It; It != E && CommonDom; ++It) {
1882    MachineBasicBlock *MBB = *It;
1883    if (MBB != CommonDom)
1884      CommonDom = findNearestCommonPostDom(MBB, CommonDom);
1885  }
1886
1887  DEBUG(
1888    dbgs() << "Common post dominator for exit blocks is ";
1889    if (CommonDom)
1890          dbgs() << "BB" << CommonDom->getNumber() << "\n";
1891    else
1892      dbgs() << "NULL\n";
1893  );
1894
1895  return CommonDom;
1896}
1897
1898char AMDGPUCFGStructurizer::ID = 0;
1899
1900} // end anonymous namespace
1901
1902
// Register the pass with LLVM's PassRegistry, declaring the analyses it
// depends on (dominators, post-dominators, loop info).
INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
                      "AMDGPU CFG Structurizer", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
                      "AMDGPU CFG Structurizer", false, false)
1909                      "AMDGPU CFG Structurizer", false, false)
1910
1911FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
1912  return new AMDGPUCFGStructurizer();
1913}
1914