MachineScheduler.cpp revision fc7fd0992a9114ea8a04d51d03c5fa4cdc1b33bf
1//===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// MachineScheduler schedules machine instructions after phi elimination. It
11// preserves LiveIntervals so it can be invoked before register allocation.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "misched"
16
17#include "llvm/CodeGen/MachineScheduler.h"
18#include "llvm/ADT/OwningPtr.h"
19#include "llvm/ADT/PriorityQueue.h"
20#include "llvm/Analysis/AliasAnalysis.h"
21#include "llvm/CodeGen/LiveIntervalAnalysis.h"
22#include "llvm/CodeGen/MachineDominators.h"
23#include "llvm/CodeGen/MachineLoopInfo.h"
24#include "llvm/CodeGen/MachineRegisterInfo.h"
25#include "llvm/CodeGen/Passes.h"
26#include "llvm/CodeGen/RegisterClassInfo.h"
27#include "llvm/CodeGen/ScheduleDFS.h"
28#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
29#include "llvm/Support/CommandLine.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/ErrorHandling.h"
32#include "llvm/Support/GraphWriter.h"
33#include "llvm/Support/raw_ostream.h"
34#include "llvm/Target/TargetInstrInfo.h"
35#include <queue>
36
37using namespace llvm;
38
39namespace llvm {
40cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
41                           cl::desc("Force top-down list scheduling"));
42cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
43                            cl::desc("Force bottom-up list scheduling"));
44}
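// For example, passing -misched-topdown or -misched-bottomup (e.g. via llc)
// restricts list scheduling to a single direction, which helps isolate
// direction-specific heuristic problems.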
45
46#ifndef NDEBUG
47static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden,
48  cl::desc("Pop up a window to show MISched dags after they are processed"));
49
50static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
51  cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
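// For example, -misched-cutoff=20 stops rescheduling after 20 instructions;
// this is handy for bisecting misched problems and is only available in
// asserts-enabled builds.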
52#else
53static bool ViewMISchedDAGs = false;
54#endif // NDEBUG
55
56static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
57  cl::desc("Enable register pressure scheduling."), cl::init(true));
58
59static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
60  cl::desc("Enable cyclic critical path analysis."), cl::init(true));
61
62static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
63  cl::desc("Enable load clustering."), cl::init(true));
64
65// Experimental heuristics
66static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
67  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
68
69static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
70  cl::desc("Verify machine instrs before and after machine scheduling"));
71
72// DAG subtrees must have at least this many nodes.
73static const unsigned MinSubtreeSize = 8;
74
75//===----------------------------------------------------------------------===//
76// Machine Instruction Scheduling Pass and Registry
77//===----------------------------------------------------------------------===//
78
79MachineSchedContext::MachineSchedContext():
80    MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) {
81  RegClassInfo = new RegisterClassInfo();
82}
83
84MachineSchedContext::~MachineSchedContext() {
85  delete RegClassInfo;
86}
87
88namespace {
89/// MachineScheduler runs after coalescing and before register allocation.
90class MachineScheduler : public MachineSchedContext,
91                         public MachineFunctionPass {
92public:
93  MachineScheduler();
94
95  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
96
97  virtual void releaseMemory() {}
98
99  virtual bool runOnMachineFunction(MachineFunction&);
100
101  virtual void print(raw_ostream &O, const Module* = 0) const;
102
103  static char ID; // Class identification, replacement for typeinfo
104};
105} // namespace
106
107char MachineScheduler::ID = 0;
108
109char &llvm::MachineSchedulerID = MachineScheduler::ID;
110
111INITIALIZE_PASS_BEGIN(MachineScheduler, "misched",
112                      "Machine Instruction Scheduler", false, false)
113INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
114INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
115INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
116INITIALIZE_PASS_END(MachineScheduler, "misched",
117                    "Machine Instruction Scheduler", false, false)
118
119MachineScheduler::MachineScheduler()
120: MachineFunctionPass(ID) {
121  initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
122}
123
124void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
125  AU.setPreservesCFG();
126  AU.addRequiredID(MachineDominatorsID);
127  AU.addRequired<MachineLoopInfo>();
128  AU.addRequired<AliasAnalysis>();
129  AU.addRequired<TargetPassConfig>();
130  AU.addRequired<SlotIndexes>();
131  AU.addPreserved<SlotIndexes>();
132  AU.addRequired<LiveIntervals>();
133  AU.addPreserved<LiveIntervals>();
134  MachineFunctionPass::getAnalysisUsage(AU);
135}
136
137MachinePassRegistry MachineSchedRegistry::Registry;
138
139/// A dummy default scheduler factory indicates whether the scheduler
140/// is overridden on the command line.
141static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
142  return 0;
143}
144
145/// MachineSchedOpt allows command line selection of the scheduler.
146static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
147               RegisterPassParser<MachineSchedRegistry> >
148MachineSchedOpt("misched",
149                cl::init(&useDefaultMachineSched), cl::Hidden,
150                cl::desc("Machine instruction scheduler to use"));
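// For example, -misched=default selects DefaultSchedRegistry below; any
// scheduler registered through MachineSchedRegistry can be chosen by name in
// the same way.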
151
152static MachineSchedRegistry
153DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
154                     useDefaultMachineSched);
155
156/// Forward declare the standard machine scheduler. This will be used as the
157/// default scheduler if the target does not set a default.
158static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);
159
160
161/// Decrement this iterator until reaching the top or a non-debug instr.
162static MachineBasicBlock::const_iterator
163priorNonDebug(MachineBasicBlock::const_iterator I,
164              MachineBasicBlock::const_iterator Beg) {
165  assert(I != Beg && "reached the top of the region, cannot decrement");
166  while (--I != Beg) {
167    if (!I->isDebugValue())
168      break;
169  }
170  return I;
171}
172
173/// Non-const version.
174static MachineBasicBlock::iterator
175priorNonDebug(MachineBasicBlock::iterator I,
176              MachineBasicBlock::const_iterator Beg) {
177  return const_cast<MachineInstr*>(
178    &*priorNonDebug(MachineBasicBlock::const_iterator(I), Beg));
179}
180
181/// If this iterator is a debug value, increment until reaching the End or a
182/// non-debug instruction.
183static MachineBasicBlock::const_iterator
184nextIfDebug(MachineBasicBlock::const_iterator I,
185            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
187    if (!I->isDebugValue())
188      break;
189  }
190  return I;
191}
192
193/// Non-const version.
194static MachineBasicBlock::iterator
195nextIfDebug(MachineBasicBlock::iterator I,
196            MachineBasicBlock::const_iterator End) {
  // Cast the return value to a non-const MachineInstr, then cast to an
  // instr_iterator, which does not check for null, and finally return a
  // bundle_iterator.
200  return MachineBasicBlock::instr_iterator(
201    const_cast<MachineInstr*>(
202      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
203}
204
205/// Top-level MachineScheduler pass driver.
206///
207/// Visit blocks in function order. Divide each block into scheduling regions
208/// and visit them bottom-up. Visiting regions bottom-up is not required, but is
209/// consistent with the DAG builder, which traverses the interior of the
210/// scheduling regions bottom-up.
211///
212/// This design avoids exposing scheduling boundaries to the DAG builder,
213/// simplifying the DAG builder's support for "special" target instructions.
214/// At the same time the design allows target schedulers to operate across
/// scheduling boundaries, for example to bundle the boundary instructions
216/// without reordering them. This creates complexity, because the target
217/// scheduler must update the RegionBegin and RegionEnd positions cached by
218/// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
219/// design would be to split blocks at scheduling boundaries, but LLVM has a
220/// general bias against block splitting purely for implementation simplicity.
221bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
  DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));
223
224  // Initialize the context of the pass.
225  MF = &mf;
226  MLI = &getAnalysis<MachineLoopInfo>();
227  MDT = &getAnalysis<MachineDominatorTree>();
228  PassConfig = &getAnalysis<TargetPassConfig>();
229  AA = &getAnalysis<AliasAnalysis>();
230
231  LIS = &getAnalysis<LiveIntervals>();
232  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
233
234  if (VerifyScheduling) {
235    DEBUG(LIS->dump());
236    MF->verify(this, "Before machine scheduling.");
237  }
238  RegClassInfo->runOnMachineFunction(*MF);
239
240  // Select the scheduler, or set the default.
241  MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
242  if (Ctor == useDefaultMachineSched) {
243    // Get the default scheduler set by the target.
244    Ctor = MachineSchedRegistry::getDefault();
245    if (!Ctor) {
246      Ctor = createConvergingSched;
247      MachineSchedRegistry::setDefault(Ctor);
248    }
249  }
250  // Instantiate the selected scheduler.
251  OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this));
252
253  // Visit all machine basic blocks.
254  //
255  // TODO: Visit blocks in global postorder or postorder within the bottom-up
256  // loop tree. Then we can optionally compute global RegPressure.
257  for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
258       MBB != MBBEnd; ++MBB) {
259
260    Scheduler->startBlock(MBB);
261
262    // Break the block into scheduling regions [I, RegionEnd), and schedule each
    // region as soon as it is discovered. RegionEnd points to the scheduling
264    // boundary at the bottom of the region. The DAG does not include RegionEnd,
265    // but the region does (i.e. the next RegionEnd is above the previous
266    // RegionBegin). If the current block has no terminator then RegionEnd ==
267    // MBB->end() for the bottom region.
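    // For example, a block of the form
    //   insts1... ; EH_LABEL ; insts2... ; return
    // yields two regions: [insts2-begin, return) is scheduled first, then
    // [insts1-begin, EH_LABEL). Labels and terminators are typical scheduling
    // boundaries.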
268    //
269    // The Scheduler may insert instructions during either schedule() or
270    // exitRegion(), even for empty regions. So the local iterators 'I' and
271    // 'RegionEnd' are invalid across these calls.
272    unsigned RemainingInstrs = MBB->size();
    for (MachineBasicBlock::iterator RegionEnd = MBB->end();
274        RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {
275
276      // Avoid decrementing RegionEnd for blocks with no terminator.
277      if (RegionEnd != MBB->end()
278          || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
279        --RegionEnd;
280        // Count the boundary instruction.
281        --RemainingInstrs;
282      }
283
284      // The next region starts above the previous region. Look backward in the
285      // instruction stream until we find the nearest boundary.
286      unsigned NumRegionInstrs = 0;
287      MachineBasicBlock::iterator I = RegionEnd;
      for (; I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
289        if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
290          break;
291      }
292      // Notify the scheduler of the region, even if we may skip scheduling
293      // it. Perhaps it still needs to be bundled.
294      Scheduler->enterRegion(MBB, I, RegionEnd, NumRegionInstrs);
295
296      // Skip empty scheduling regions (0 or 1 schedulable instructions).
297      if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
298        // Close the current region. Bundle the terminator if needed.
299        // This invalidates 'RegionEnd' and 'I'.
300        Scheduler->exitRegion();
301        continue;
302      }
303      DEBUG(dbgs() << "********** MI Scheduling **********\n");
304      DEBUG(dbgs() << MF->getName()
305            << ":BB#" << MBB->getNumber() << " " << MBB->getName()
306            << "\n  From: " << *I << "    To: ";
307            if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
308            else dbgs() << "End";
309            dbgs() << " RegionInstrs: " << NumRegionInstrs
310            << " Remaining: " << RemainingInstrs << "\n");
311
312      // Schedule a region: possibly reorder instructions.
313      // This invalidates 'RegionEnd' and 'I'.
314      Scheduler->schedule();
315
316      // Close the current region.
317      Scheduler->exitRegion();
318
319      // Scheduling has invalidated the current iterator 'I'. Ask the
      // scheduler for the top of its scheduled region.
321      RegionEnd = Scheduler->begin();
322    }
323    assert(RemainingInstrs == 0 && "Instruction count mismatch!");
324    Scheduler->finishBlock();
325  }
326  Scheduler->finalizeSchedule();
327  DEBUG(LIS->dump());
328  if (VerifyScheduling)
329    MF->verify(this, "After machine scheduling.");
330  return true;
331}
332
333void MachineScheduler::print(raw_ostream &O, const Module* m) const {
334  // unimplemented
335}
336
337#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
338void ReadyQueue::dump() {
339  dbgs() << Name << ": ";
340  for (unsigned i = 0, e = Queue.size(); i < e; ++i)
341    dbgs() << Queue[i]->NodeNum << " ";
342  dbgs() << "\n";
343}
344#endif
345
346//===----------------------------------------------------------------------===//
347// ScheduleDAGMI - Base class for MachineInstr scheduling with LiveIntervals
348// preservation.
349//===----------------------------------------------------------------------===//
350
351ScheduleDAGMI::~ScheduleDAGMI() {
352  delete DFSResult;
353  DeleteContainerPointers(Mutations);
354  delete SchedImpl;
355}
356
357bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
358  return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
359}
360
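/// Add the predecessor dependence PredDep to SuccSU, rejecting the edge if it
/// would create a cycle in the DAG.
///
/// For example, LoadClusterMutation below requests a cluster edge with
/// DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)).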
361bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
362  if (SuccSU != &ExitSU) {
363    // Do not use WillCreateCycle, it assumes SD scheduling.
364    // If Pred is reachable from Succ, then the edge creates a cycle.
365    if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
366      return false;
367    Topo.AddPred(SuccSU, PredDep.getSUnit());
368  }
369  SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
370  // Return true regardless of whether a new edge needed to be inserted.
371  return true;
372}
373
374/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
375/// NumPredsLeft reaches zero, release the successor node.
376///
377/// FIXME: Adjust SuccSU height based on MinLatency.
378void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
379  SUnit *SuccSU = SuccEdge->getSUnit();
380
381  if (SuccEdge->isWeak()) {
382    --SuccSU->WeakPredsLeft;
383    if (SuccEdge->isCluster())
384      NextClusterSucc = SuccSU;
385    return;
386  }
387#ifndef NDEBUG
388  if (SuccSU->NumPredsLeft == 0) {
389    dbgs() << "*** Scheduling failed! ***\n";
390    SuccSU->dump(this);
391    dbgs() << " has been released too many times!\n";
392    llvm_unreachable(0);
393  }
394#endif
395  --SuccSU->NumPredsLeft;
396  if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
397    SchedImpl->releaseTopNode(SuccSU);
398}
399
400/// releaseSuccessors - Call releaseSucc on each of SU's successors.
401void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
402  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
403       I != E; ++I) {
404    releaseSucc(SU, &*I);
405  }
406}
407
408/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
409/// NumSuccsLeft reaches zero, release the predecessor node.
410///
411/// FIXME: Adjust PredSU height based on MinLatency.
412void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
413  SUnit *PredSU = PredEdge->getSUnit();
414
415  if (PredEdge->isWeak()) {
416    --PredSU->WeakSuccsLeft;
417    if (PredEdge->isCluster())
418      NextClusterPred = PredSU;
419    return;
420  }
421#ifndef NDEBUG
422  if (PredSU->NumSuccsLeft == 0) {
423    dbgs() << "*** Scheduling failed! ***\n";
424    PredSU->dump(this);
425    dbgs() << " has been released too many times!\n";
426    llvm_unreachable(0);
427  }
428#endif
429  --PredSU->NumSuccsLeft;
430  if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
431    SchedImpl->releaseBottomNode(PredSU);
432}
433
434/// releasePredecessors - Call releasePred on each of SU's predecessors.
435void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
436  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
437       I != E; ++I) {
438    releasePred(SU, &*I);
439  }
440}
441
442/// This is normally called from the main scheduler loop but may also be invoked
443/// by the scheduling strategy to perform additional code motion.
444void ScheduleDAGMI::moveInstruction(MachineInstr *MI,
445                                    MachineBasicBlock::iterator InsertPos) {
446  // Advance RegionBegin if the first instruction moves down.
447  if (&*RegionBegin == MI)
448    ++RegionBegin;
449
450  // Update the instruction stream.
451  BB->splice(InsertPos, BB, MI);
452
453  // Update LiveIntervals
454  LIS->handleMove(MI, /*UpdateFlags=*/true);
455
456  // Recede RegionBegin if an instruction moves above the first.
457  if (RegionBegin == InsertPos)
458    RegionBegin = MI;
459}
460
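/// Return true if the scheduler may continue issuing instructions. In
/// asserts-enabled builds, reaching the -misched-cutoff limit collapses the
/// unscheduled zone (CurrentTop = CurrentBottom) and returns false so that
/// schedule() stops early.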
461bool ScheduleDAGMI::checkSchedLimit() {
462#ifndef NDEBUG
463  if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
464    CurrentTop = CurrentBottom;
465    return false;
466  }
467  ++NumInstrsScheduled;
468#endif
469  return true;
470}
471
472/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
473/// crossing a scheduling boundary. [begin, end) includes all instructions in
474/// the region, including the boundary itself and single-instruction regions
475/// that don't get scheduled.
476void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
477                                MachineBasicBlock::iterator begin,
478                                MachineBasicBlock::iterator end,
479                                unsigned regioninstrs)
480{
481  ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
482
483  // For convenience remember the end of the liveness region.
484  LiveRegionEnd =
485    (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
486
487  SUPressureDiffs.clear();
488
489  SchedImpl->initPolicy(begin, end, regioninstrs);
490
491  ShouldTrackPressure = SchedImpl->shouldTrackPressure();
492}
493
// Set up the register pressure trackers for the top-scheduled and
// bottom-scheduled regions.
496void ScheduleDAGMI::initRegPressure() {
497  TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
498  BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
499
500  // Close the RPTracker to finalize live ins.
501  RPTracker.closeRegion();
502
503  DEBUG(RPTracker.dump());
504
505  // Initialize the live ins and live outs.
506  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
507  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
508
509  // Close one end of the tracker so we can call
510  // getMaxUpward/DownwardPressureDelta before advancing across any
511  // instructions. This converts currently live regs into live ins/outs.
512  TopRPTracker.closeTop();
513  BotRPTracker.closeBottom();
514
515  BotRPTracker.initLiveThru(RPTracker);
516  if (!BotRPTracker.getLiveThru().empty()) {
517    TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
518    DEBUG(dbgs() << "Live Thru: ";
519          dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
  }
521
522  // For each live out vreg reduce the pressure change associated with other
523  // uses of the same vreg below the live-out reaching def.
524  updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
525
526  // Account for liveness generated by the region boundary.
527  if (LiveRegionEnd != RegionEnd) {
528    SmallVector<unsigned, 8> LiveUses;
529    BotRPTracker.recede(&LiveUses);
530    updatePressureDiffs(LiveUses);
531  }
532
533  assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
534
535  // Cache the list of excess pressure sets in this region. This will also track
536  // the max pressure in the scheduled code for these sets.
537  RegionCriticalPSets.clear();
538  const std::vector<unsigned> &RegionPressure =
539    RPTracker.getPressure().MaxSetPressure;
540  for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
541    unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
542    if (RegionPressure[i] > Limit) {
543      DEBUG(dbgs() << TRI->getRegPressureSetName(i)
544            << " Limit " << Limit
545            << " Actual " << RegionPressure[i] << "\n");
546      RegionCriticalPSets.push_back(PressureChange(i));
547    }
548  }
549  DEBUG(dbgs() << "Excess PSets: ";
550        for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
551          dbgs() << TRI->getRegPressureSetName(
552            RegionCriticalPSets[i].getPSet()) << " ";
553        dbgs() << "\n");
554}
555
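/// Record the new maximum pressure reached for each of this region's critical
/// pressure sets after scheduling SU, and report (in debug output) any set
/// whose new maximum is at or near the target's limit.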
556void ScheduleDAGMI::
557updateScheduledPressure(const SUnit *SU,
558                        const std::vector<unsigned> &NewMaxPressure) {
559  const PressureDiff &PDiff = getPressureDiff(SU);
560  unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
561  for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end();
562       I != E; ++I) {
563    if (!I->isValid())
564      break;
565    unsigned ID = I->getPSet();
566    while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
567      ++CritIdx;
568    if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
569      if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
570          && NewMaxPressure[ID] <= INT16_MAX)
571        RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
572    }
573    unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
574    if (NewMaxPressure[ID] >= Limit - 2) {
575      DEBUG(dbgs() << "  " << TRI->getRegPressureSetName(ID) << ": "
576            << NewMaxPressure[ID] << " > " << Limit << "(+ "
577            << BotRPTracker.getLiveThru()[ID] << " livethru)\n");
578    }
579  }
580}
581
582/// Update the PressureDiff array for liveness after scheduling this
583/// instruction.
584void ScheduleDAGMI::updatePressureDiffs(ArrayRef<unsigned> LiveUses) {
585  for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) {
586    /// FIXME: Currently assuming single-use physregs.
587    unsigned Reg = LiveUses[LUIdx];
588    DEBUG(dbgs() << "  LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n");
589    if (!TRI->isVirtualRegister(Reg))
590      continue;
591
592    // This may be called before CurrentBottom has been initialized. However,
593    // BotRPTracker must have a valid position. We want the value live into the
594    // instruction or live out of the block, so ask for the previous
595    // instruction's live-out.
596    const LiveInterval &LI = LIS->getInterval(Reg);
597    VNInfo *VNI;
598    MachineBasicBlock::const_iterator I =
599      nextIfDebug(BotRPTracker.getPos(), BB->end());
600    if (I == BB->end())
601      VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
602    else {
603      LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(I));
604      VNI = LRQ.valueIn();
605    }
606    // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
607    assert(VNI && "No live value at use.");
608    for (VReg2UseMap::iterator
609           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
610      SUnit *SU = UI->SU;
611      DEBUG(dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") "
612            << *SU->getInstr());
613      // If this use comes before the reaching def, it cannot be a last use, so
      // decrease its pressure change.
615      if (!SU->isScheduled && SU != &ExitSU) {
616        LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(SU->getInstr()));
617        if (LRQ.valueIn() == VNI)
618          getPressureDiff(SU).addPressureChange(Reg, true, &MRI);
619      }
620    }
621  }
622}
623
624/// schedule - Called back from MachineScheduler::runOnMachineFunction
625/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
626/// only includes instructions that have DAG nodes, not scheduling boundaries.
627///
628/// This is a skeletal driver, with all the functionality pushed into helpers,
/// so that it can be easily extended by experimental schedulers. Generally,
630/// implementing MachineSchedStrategy should be sufficient to implement a new
631/// scheduling algorithm. However, if a scheduler further subclasses
632/// ScheduleDAGMI then it will want to override this virtual method in order to
633/// update any specialized state.
634void ScheduleDAGMI::schedule() {
635  buildDAGWithRegPressure();
636
637  Topo.InitDAGTopologicalSorting();
638
639  postprocessDAG();
640
641  SmallVector<SUnit*, 8> TopRoots, BotRoots;
642  findRootsAndBiasEdges(TopRoots, BotRoots);
643
644  // Initialize the strategy before modifying the DAG.
645  // This may initialize a DFSResult to be used for queue priority.
646  SchedImpl->initialize(this);
647
648  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
649          SUnits[su].dumpAll(this));
650  if (ViewMISchedDAGs) viewGraph();
651
652  // Initialize ready queues now that the DAG and priority data are finalized.
653  initQueues(TopRoots, BotRoots);
654
655  bool IsTopNode = false;
656  while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
657    assert(!SU->isScheduled && "Node already scheduled");
658    if (!checkSchedLimit())
659      break;
660
661    scheduleMI(SU, IsTopNode);
662
663    updateQueues(SU, IsTopNode);
664  }
665  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
666
667  placeDebugValues();
668
669  DEBUG({
670      unsigned BBNum = begin()->getParent()->getNumber();
671      dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
672      dumpSchedule();
673      dbgs() << '\n';
674    });
675}
676
/// Build the DAG and set up three register pressure trackers.
678void ScheduleDAGMI::buildDAGWithRegPressure() {
679  if (!ShouldTrackPressure) {
680    RPTracker.reset();
681    RegionCriticalPSets.clear();
682    buildSchedGraph(AA);
683    return;
684  }
685
686  // Initialize the register pressure tracker used by buildSchedGraph.
687  RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
688                 /*TrackUntiedDefs=*/true);
689
  // Account for liveness generated by the region boundary.
691  if (LiveRegionEnd != RegionEnd)
692    RPTracker.recede();
693
694  // Build the DAG, and compute current register pressure.
695  buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);
696
697  // Initialize top/bottom trackers after computing region pressure.
698  initRegPressure();
699}
700
701/// Apply each ScheduleDAGMutation step in order.
702void ScheduleDAGMI::postprocessDAG() {
703  for (unsigned i = 0, e = Mutations.size(); i < e; ++i) {
704    Mutations[i]->apply(this);
705  }
706}
707
708void ScheduleDAGMI::computeDFSResult() {
709  if (!DFSResult)
710    DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
711  DFSResult->clear();
712  ScheduledTrees.clear();
713  DFSResult->resize(SUnits.size());
714  DFSResult->compute(SUnits);
715  ScheduledTrees.resize(DFSResult->getNumSubtrees());
716}
717
718void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
719                                          SmallVectorImpl<SUnit*> &BotRoots) {
720  for (std::vector<SUnit>::iterator
721         I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
722    SUnit *SU = &(*I);
723    assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits");
724
725    // Order predecessors so DFSResult follows the critical path.
726    SU->biasCriticalPath();
727
728    // A SUnit is ready to top schedule if it has no predecessors.
729    if (!I->NumPredsLeft)
730      TopRoots.push_back(SU);
731    // A SUnit is ready to bottom schedule if it has no successors.
732    if (!I->NumSuccsLeft)
733      BotRoots.push_back(SU);
734  }
735  ExitSU.biasCriticalPath();
736}
737
738/// Compute the max cyclic critical path through the DAG. The scheduling DAG
739/// only provides the critical path for single block loops. To handle loops that
740/// span blocks, we could use the vreg path latencies provided by
741/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
742/// available for use in the scheduler.
743///
744/// The cyclic path estimation identifies a def-use pair that crosses the back
745/// edge and considers the depth and height of the nodes. For example, consider
746/// the following instruction sequence where each instruction has unit latency
/// and defines an eponymous virtual register:
748///
749/// a->b(a,c)->c(b)->d(c)->exit
750///
/// The cyclic critical path is two cycles: b->c->b
752/// The acyclic critical path is four cycles: a->b->c->d->exit
753/// LiveOutHeight = height(c) = len(c->d->exit) = 2
754/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
755/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
756/// LiveInDepth = depth(b) = len(a->b) = 1
757///
758/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
759/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
760/// CyclicCriticalPath = min(2, 2) = 2
761unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
  // This only applies to single-block loops.
763  if (!BB->isSuccessor(BB))
764    return 0;
765
766  unsigned MaxCyclicLatency = 0;
767  // Visit each live out vreg def to find def/use pairs that cross iterations.
768  ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
769  for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
770       RI != RE; ++RI) {
771    unsigned Reg = *RI;
772    if (!TRI->isVirtualRegister(Reg))
      continue;
774    const LiveInterval &LI = LIS->getInterval(Reg);
775    const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
776    if (!DefVNI)
777      continue;
778
779    MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
780    const SUnit *DefSU = getSUnit(DefMI);
781    if (!DefSU)
782      continue;
783
784    unsigned LiveOutHeight = DefSU->getHeight();
785    unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
786    // Visit all local users of the vreg def.
787    for (VReg2UseMap::iterator
788           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
789      if (UI->SU == &ExitSU)
790        continue;
791
792      // Only consider uses of the phi.
793      LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
794      if (!LRQ.valueIn()->isPHIDef())
795        continue;
796
797      // Assume that a path spanning two iterations is a cycle, which could
798      // overestimate in strange cases. This allows cyclic latency to be
799      // estimated as the minimum slack of the vreg's depth or height.
800      unsigned CyclicLatency = 0;
801      if (LiveOutDepth > UI->SU->getDepth())
802        CyclicLatency = LiveOutDepth - UI->SU->getDepth();
803
804      unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
805      if (LiveInHeight > LiveOutHeight) {
806        if (LiveInHeight - LiveOutHeight < CyclicLatency)
807          CyclicLatency = LiveInHeight - LiveOutHeight;
808      }
809      else
810        CyclicLatency = 0;
811
812      DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
813            << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
814      if (CyclicLatency > MaxCyclicLatency)
815        MaxCyclicLatency = CyclicLatency;
816    }
817  }
818  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
819  return MaxCyclicLatency;
820}
821
/// Identify DAG roots and set up scheduler queues.
823void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
824                               ArrayRef<SUnit*> BotRoots) {
825  NextClusterSucc = NULL;
826  NextClusterPred = NULL;
827
828  // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
829  //
830  // Nodes with unreleased weak edges can still be roots.
831  // Release top roots in forward order.
832  for (SmallVectorImpl<SUnit*>::const_iterator
833         I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) {
834    SchedImpl->releaseTopNode(*I);
835  }
836  // Release bottom roots in reverse order so the higher priority nodes appear
837  // first. This is more natural and slightly more efficient.
838  for (SmallVectorImpl<SUnit*>::const_reverse_iterator
839         I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
840    SchedImpl->releaseBottomNode(*I);
841  }
842
843  releaseSuccessors(&EntrySU);
844  releasePredecessors(&ExitSU);
845
846  SchedImpl->registerRoots();
847
848  // Advance past initial DebugValues.
849  CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
850  CurrentBottom = RegionEnd;
851
852  if (ShouldTrackPressure) {
853    assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
854    TopRPTracker.setPos(CurrentTop);
855  }
856}
857
858/// Move an instruction and update register pressure.
859void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
860  // Move the instruction to its new location in the instruction stream.
861  MachineInstr *MI = SU->getInstr();
862
863  if (IsTopNode) {
864    assert(SU->isTopReady() && "node still has unscheduled dependencies");
865    if (&*CurrentTop == MI)
866      CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
867    else {
868      moveInstruction(MI, CurrentTop);
869      TopRPTracker.setPos(MI);
870    }
871
872    if (ShouldTrackPressure) {
873      // Update top scheduled pressure.
874      TopRPTracker.advance();
875      assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
876      updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
877    }
878  }
879  else {
880    assert(SU->isBottomReady() && "node still has unscheduled dependencies");
881    MachineBasicBlock::iterator priorII =
882      priorNonDebug(CurrentBottom, CurrentTop);
883    if (&*priorII == MI)
884      CurrentBottom = priorII;
885    else {
886      if (&*CurrentTop == MI) {
887        CurrentTop = nextIfDebug(++CurrentTop, priorII);
888        TopRPTracker.setPos(CurrentTop);
889      }
890      moveInstruction(MI, CurrentBottom);
891      CurrentBottom = MI;
892    }
893    if (ShouldTrackPressure) {
894      // Update bottom scheduled pressure.
895      SmallVector<unsigned, 8> LiveUses;
896      BotRPTracker.recede(&LiveUses);
897      assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
898      updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
899      updatePressureDiffs(LiveUses);
900    }
901  }
902}
903
904/// Update scheduler queues after scheduling an instruction.
905void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
906  // Release dependent instructions for scheduling.
907  if (IsTopNode)
908    releaseSuccessors(SU);
909  else
910    releasePredecessors(SU);
911
912  SU->isScheduled = true;
913
914  if (DFSResult) {
915    unsigned SubtreeID = DFSResult->getSubtreeID(SU);
916    if (!ScheduledTrees.test(SubtreeID)) {
917      ScheduledTrees.set(SubtreeID);
918      DFSResult->scheduleTree(SubtreeID);
919      SchedImpl->scheduleTree(SubtreeID);
920    }
921  }
922
923  // Notify the scheduling strategy after updating the DAG.
924  SchedImpl->schedNode(SU, IsTopNode);
925}
926
927/// Reinsert any remaining debug_values, just like the PostRA scheduler.
928void ScheduleDAGMI::placeDebugValues() {
929  // If first instruction was a DBG_VALUE then put it back.
930  if (FirstDbgValue) {
931    BB->splice(RegionBegin, BB, FirstDbgValue);
932    RegionBegin = FirstDbgValue;
933  }
934
935  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
936         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
937    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
938    MachineInstr *DbgValue = P.first;
939    MachineBasicBlock::iterator OrigPrevMI = P.second;
940    if (&*RegionBegin == DbgValue)
941      ++RegionBegin;
942    BB->splice(++OrigPrevMI, BB, DbgValue);
943    if (OrigPrevMI == llvm::prior(RegionEnd))
944      RegionEnd = DbgValue;
945  }
946  DbgValues.clear();
947  FirstDbgValue = NULL;
948}
949
950#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
951void ScheduleDAGMI::dumpSchedule() const {
952  for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
953    if (SUnit *SU = getSUnit(&(*MI)))
954      SU->dump(this);
955    else
956      dbgs() << "Missing SUnit\n";
957  }
958}
959#endif
960
961//===----------------------------------------------------------------------===//
962// LoadClusterMutation - DAG post-processing to cluster loads.
963//===----------------------------------------------------------------------===//
964
965namespace {
966/// \brief Post-process the DAG to create cluster edges between neighboring
967/// loads.
968class LoadClusterMutation : public ScheduleDAGMutation {
969  struct LoadInfo {
970    SUnit *SU;
971    unsigned BaseReg;
972    unsigned Offset;
973    LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
974      : SU(su), BaseReg(reg), Offset(ofs) {}
975  };
976  static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS,
977                           const LoadClusterMutation::LoadInfo &RHS);
978
979  const TargetInstrInfo *TII;
980  const TargetRegisterInfo *TRI;
981public:
982  LoadClusterMutation(const TargetInstrInfo *tii,
983                      const TargetRegisterInfo *tri)
984    : TII(tii), TRI(tri) {}
985
986  virtual void apply(ScheduleDAGMI *DAG);
987protected:
988  void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
989};
990} // anonymous
991
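// Order loads by base register, then by offset, so loads off the same base
// become adjacent after sorting in clusterNeighboringLoads().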
992bool LoadClusterMutation::LoadInfoLess(
993  const LoadClusterMutation::LoadInfo &LHS,
994  const LoadClusterMutation::LoadInfo &RHS) {
995  if (LHS.BaseReg != RHS.BaseReg)
996    return LHS.BaseReg < RHS.BaseReg;
997  return LHS.Offset < RHS.Offset;
998}
999
1000void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
1001                                                  ScheduleDAGMI *DAG) {
1002  SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
1003  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
1004    SUnit *SU = Loads[Idx];
1005    unsigned BaseReg;
1006    unsigned Offset;
1007    if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
1008      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
1009  }
1010  if (LoadRecords.size() < 2)
1011    return;
1012  std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess);
1013  unsigned ClusterLength = 1;
1014  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
1015    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
1016      ClusterLength = 1;
1017      continue;
1018    }
1019
1020    SUnit *SUa = LoadRecords[Idx].SU;
1021    SUnit *SUb = LoadRecords[Idx+1].SU;
1022    if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
1023        && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
1024
1025      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
1026            << SUb->NodeNum << ")\n");
1027      // Copy successor edges from SUa to SUb. Interleaving computation
1028      // dependent on SUa can prevent load combining due to register reuse.
1029      // Predecessor edges do not need to be copied from SUb to SUa since nearby
1030      // loads should have effectively the same inputs.
1031      for (SUnit::const_succ_iterator
1032             SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
1033        if (SI->getSUnit() == SUb)
1034          continue;
1035        DEBUG(dbgs() << "  Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
1036        DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
1037      }
1038      ++ClusterLength;
1039    }
1040    else
1041      ClusterLength = 1;
1042  }
1043}
1044
1045/// \brief Callback from DAG postProcessing to create cluster edges for loads.
1046void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
1047  // Map DAG NodeNum to store chain ID.
1048  DenseMap<unsigned, unsigned> StoreChainIDs;
1049  // Map each store chain to a set of dependent loads.
1050  SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
1051  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
1052    SUnit *SU = &DAG->SUnits[Idx];
1053    if (!SU->getInstr()->mayLoad())
1054      continue;
1055    unsigned ChainPredID = DAG->SUnits.size();
1056    for (SUnit::const_pred_iterator
1057           PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
1058      if (PI->isCtrl()) {
1059        ChainPredID = PI->getSUnit()->NodeNum;
1060        break;
1061      }
1062    }
1063    // Check if this chain-like pred has been seen
1064    // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
1065    unsigned NumChains = StoreChainDependents.size();
1066    std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
1067      StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
1068    if (Result.second)
1069      StoreChainDependents.resize(NumChains + 1);
1070    StoreChainDependents[Result.first->second].push_back(SU);
1071  }
1072  // Iterate over the store chains.
1073  for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
1074    clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
1075}
1076
1077//===----------------------------------------------------------------------===//
1078// MacroFusion - DAG post-processing to encourage fusion of macro ops.
1079//===----------------------------------------------------------------------===//
1080
1081namespace {
1082/// \brief Post-process the DAG to create cluster edges between instructions
1083/// that may be fused by the processor into a single operation.
1084class MacroFusion : public ScheduleDAGMutation {
1085  const TargetInstrInfo *TII;
1086public:
1087  MacroFusion(const TargetInstrInfo *tii): TII(tii) {}
1088
1089  virtual void apply(ScheduleDAGMI *DAG);
1090};
1091} // anonymous
1092
1093/// \brief Callback from DAG postProcessing to create cluster edges to encourage
1094/// fused operations.
1095void MacroFusion::apply(ScheduleDAGMI *DAG) {
1096  // For now, assume targets can only fuse with the branch.
1097  MachineInstr *Branch = DAG->ExitSU.getInstr();
1098  if (!Branch)
1099    return;
1100
1101  for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) {
1102    SUnit *SU = &DAG->SUnits[--Idx];
1103    if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch))
1104      continue;
1105
1106    // Create a single weak edge from SU to ExitSU. The only effect is to cause
1107    // bottom-up scheduling to heavily prioritize the clustered SU.  There is no
1108    // need to copy predecessor edges from ExitSU to SU, since top-down
1109    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
1110    // of SU, we could create an artificial edge from the deepest root, but it
1111    // hasn't been needed yet.
1112    bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster));
1113    (void)Success;
1114    assert(Success && "No DAG nodes should be reachable from ExitSU");
1115
1116    DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n");
1117    break;
1118  }
1119}
1120
1121//===----------------------------------------------------------------------===//
1122// CopyConstrain - DAG post-processing to encourage copy elimination.
1123//===----------------------------------------------------------------------===//
1124
1125namespace {
1126/// \brief Post-process the DAG to create weak edges from all uses of a copy to
1127/// the one use that defines the copy's source vreg, most likely an induction
1128/// variable increment.
1129class CopyConstrain : public ScheduleDAGMutation {
1130  // Transient state.
1131  SlotIndex RegionBeginIdx;
1132  // RegionEndIdx is the slot index of the last non-debug instruction in the
1133  // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
1134  SlotIndex RegionEndIdx;
1135public:
1136  CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
1137
1138  virtual void apply(ScheduleDAGMI *DAG);
1139
1140protected:
1141  void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG);
1142};
1143} // anonymous
1144
1145/// constrainLocalCopy handles two possibilities:
1146/// 1) Local src:
1147/// I0:     = dst
1148/// I1: src = ...
1149/// I2:     = dst
1150/// I3: dst = src (copy)
1151/// (create pred->succ edges I0->I1, I2->I1)
1152///
1153/// 2) Local copy:
1154/// I0: dst = src (copy)
1155/// I1:     = dst
1156/// I2: src = ...
1157/// I3:     = dst
1158/// (create pred->succ edges I1->I2, I3->I2)
1159///
1160/// Although the MachineScheduler is currently constrained to single blocks,
1161/// this algorithm should handle extended blocks. An EBB is a set of
1162/// contiguously numbered blocks such that the previous block in the EBB is
1163/// always the single predecessor.
1164void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) {
1165  LiveIntervals *LIS = DAG->getLIS();
1166  MachineInstr *Copy = CopySU->getInstr();
1167
1168  // Check for pure vreg copies.
1169  unsigned SrcReg = Copy->getOperand(1).getReg();
1170  if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
1171    return;
1172
1173  unsigned DstReg = Copy->getOperand(0).getReg();
1174  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
1175    return;
1176
1177  // Check if either the dest or source is local. If it's live across a back
1178  // edge, it's not local. Note that if both vregs are live across the back
  // edge, we cannot successfully constrain the copy without cyclic scheduling.
1180  unsigned LocalReg = DstReg;
1181  unsigned GlobalReg = SrcReg;
1182  LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
1183  if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
1184    LocalReg = SrcReg;
1185    GlobalReg = DstReg;
1186    LocalLI = &LIS->getInterval(LocalReg);
1187    if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
1188      return;
1189  }
1190  LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
1191
1192  // Find the global segment after the start of the local LI.
1193  LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
1194  // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
1195  // local live range. We could create edges from other global uses to the local
1196  // start, but the coalescer should have already eliminated these cases, so
1197  // don't bother dealing with it.
1198  if (GlobalSegment == GlobalLI->end())
1199    return;
1200
1201  // If GlobalSegment is killed at the LocalLI->start, the call to find()
1202  // returned the next global segment. But if GlobalSegment overlaps with
  // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
1204  // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
1205  if (GlobalSegment->contains(LocalLI->beginIndex()))
1206    ++GlobalSegment;
1207
1208  if (GlobalSegment == GlobalLI->end())
1209    return;
1210
1211  // Check if GlobalLI contains a hole in the vicinity of LocalLI.
1212  if (GlobalSegment != GlobalLI->begin()) {
1213    // Two address defs have no hole.
1214    if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->end,
1215                               GlobalSegment->start)) {
1216      return;
1217    }
1218    // If the prior global segment may be defined by the same two-address
1219    // instruction that also defines LocalLI, then can't make a hole here.
1220    if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->start,
1221                               LocalLI->beginIndex())) {
1222      return;
1223    }
1224    // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
1225    // it would be a disconnected component in the live range.
1226    assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() &&
1227           "Disconnected LRG within the scheduling region.");
1228  }
1229  MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
1230  if (!GlobalDef)
1231    return;
1232
1233  SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
1234  if (!GlobalSU)
1235    return;
1236
1237  // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
1238  // constraining the uses of the last local def to precede GlobalDef.
1239  SmallVector<SUnit*,8> LocalUses;
1240  const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
1241  MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
1242  SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
1243  for (SUnit::const_succ_iterator
1244         I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end();
1245       I != E; ++I) {
1246    if (I->getKind() != SDep::Data || I->getReg() != LocalReg)
1247      continue;
1248    if (I->getSUnit() == GlobalSU)
1249      continue;
1250    if (!DAG->canAddEdge(GlobalSU, I->getSUnit()))
1251      return;
1252    LocalUses.push_back(I->getSUnit());
1253  }
1254  // Open the top of the GlobalLI hole by constraining any earlier global uses
1255  // to precede the start of LocalLI.
1256  SmallVector<SUnit*,8> GlobalUses;
1257  MachineInstr *FirstLocalDef =
1258    LIS->getInstructionFromIndex(LocalLI->beginIndex());
1259  SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
1260  for (SUnit::const_pred_iterator
1261         I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) {
1262    if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg)
1263      continue;
1264    if (I->getSUnit() == FirstLocalSU)
1265      continue;
1266    if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit()))
1267      return;
1268    GlobalUses.push_back(I->getSUnit());
1269  }
1270  DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
1271  // Add the weak edges.
1272  for (SmallVectorImpl<SUnit*>::const_iterator
1273         I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) {
1274    DEBUG(dbgs() << "  Local use SU(" << (*I)->NodeNum << ") -> SU("
1275          << GlobalSU->NodeNum << ")\n");
1276    DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak));
1277  }
1278  for (SmallVectorImpl<SUnit*>::const_iterator
1279         I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) {
1280    DEBUG(dbgs() << "  Global use SU(" << (*I)->NodeNum << ") -> SU("
1281          << FirstLocalSU->NodeNum << ")\n");
1282    DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak));
1283  }
1284}
1285
1286/// \brief Callback from DAG postProcessing to create weak edges to encourage
1287/// copy elimination.
1288void CopyConstrain::apply(ScheduleDAGMI *DAG) {
1289  MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
1290  if (FirstPos == DAG->end())
1291    return;
1292  RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos);
1293  RegionEndIdx = DAG->getLIS()->getInstructionIndex(
1294    &*priorNonDebug(DAG->end(), DAG->begin()));
1295
1296  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
1297    SUnit *SU = &DAG->SUnits[Idx];
1298    if (!SU->getInstr()->isCopy())
1299      continue;
1300
1301    constrainLocalCopy(SU, DAG);
1302  }
1303}
1304
1305//===----------------------------------------------------------------------===//
1306// ConvergingScheduler - Implementation of the generic MachineSchedStrategy.
1307//===----------------------------------------------------------------------===//
1308
1309namespace {
1310/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
1311/// the schedule.
1312class ConvergingScheduler : public MachineSchedStrategy {
1313public:
1314  /// Represent the type of SchedCandidate found within a single queue.
1315  /// pickNodeBidirectional depends on these listed by decreasing priority.
1316  enum CandReason {
1317    NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax,
1318    ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
1319    TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
1320
1321#ifndef NDEBUG
1322  static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
1323#endif
1324
1325  /// Policy for scheduling the next instruction in the candidate's zone.
1326  struct CandPolicy {
1327    bool ReduceLatency;
1328    unsigned ReduceResIdx;
1329    unsigned DemandResIdx;
1330
1331    CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {}
1332  };
1333
1334  /// Status of an instruction's critical resource consumption.
1335  struct SchedResourceDelta {
1336    // Count critical resources in the scheduled region required by SU.
1337    unsigned CritResources;
1338
1339    // Count critical resources from another region consumed by SU.
1340    unsigned DemandedResources;
1341
1342    SchedResourceDelta(): CritResources(0), DemandedResources(0) {}
1343
1344    bool operator==(const SchedResourceDelta &RHS) const {
1345      return CritResources == RHS.CritResources
1346        && DemandedResources == RHS.DemandedResources;
1347    }
1348    bool operator!=(const SchedResourceDelta &RHS) const {
1349      return !operator==(RHS);
1350    }
1351  };
1352
1353  /// Store the state used by ConvergingScheduler heuristics, required for the
1354  /// lifetime of one invocation of pickNode().
1355  struct SchedCandidate {
1356    CandPolicy Policy;
1357
1358    // The best SUnit candidate.
1359    SUnit *SU;
1360
1361    // The reason for this candidate.
1362    CandReason Reason;
1363
1364    // Set of reasons that apply to multiple candidates.
1365    uint32_t RepeatReasonSet;
1366
1367    // Register pressure values for the best candidate.
1368    RegPressureDelta RPDelta;
1369
1370    // Critical resource consumption of the best candidate.
1371    SchedResourceDelta ResDelta;
1372
1373    SchedCandidate(const CandPolicy &policy)
1374      : Policy(policy), SU(NULL), Reason(NoCand), RepeatReasonSet(0) {}
1375
1376    bool isValid() const { return SU; }
1377
1378    // Copy the status of another candidate without changing policy.
1379    void setBest(SchedCandidate &Best) {
1380      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
1381      SU = Best.SU;
1382      Reason = Best.Reason;
1383      RPDelta = Best.RPDelta;
1384      ResDelta = Best.ResDelta;
1385    }
1386
1387    bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); }
1388    void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); }
1389
1390    void initResourceDelta(const ScheduleDAGMI *DAG,
1391                           const TargetSchedModel *SchedModel);
1392  };
1393
1394  /// Summarize the unscheduled region.
1395  struct SchedRemainder {
1396    // Critical path through the DAG in expected latency.
1397    unsigned CriticalPath;
1398    unsigned CyclicCritPath;
1399
1400    // Scaled count of micro-ops left to schedule.
1401    unsigned RemIssueCount;
1402
1403    bool IsAcyclicLatencyLimited;
1404
1405    // Unscheduled resources
1406    SmallVector<unsigned, 16> RemainingCounts;
1407
1408    void reset() {
1409      CriticalPath = 0;
1410      CyclicCritPath = 0;
1411      RemIssueCount = 0;
1412      IsAcyclicLatencyLimited = false;
1413      RemainingCounts.clear();
1414    }
1415
1416    SchedRemainder() { reset(); }
1417
1418    void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel);
1419  };
1420
1421  /// Each Scheduling boundary is associated with ready queues. It tracks the
1422  /// current cycle in the direction of movement, and maintains the state
1423  /// of "hazards" and other interlocks at the current cycle.
1424  struct SchedBoundary {
1425    ScheduleDAGMI *DAG;
1426    const TargetSchedModel *SchedModel;
1427    SchedRemainder *Rem;
1428
1429    ReadyQueue Available;
1430    ReadyQueue Pending;
1431    bool CheckPending;
1432
1433    // For heuristics, keep a list of the nodes that immediately depend on the
1434    // most recently scheduled node.
1435    SmallPtrSet<const SUnit*, 8> NextSUs;
1436
1437    ScheduleHazardRecognizer *HazardRec;
1438
1439    /// Number of cycles it takes to issue the instructions scheduled in this
1440    /// zone. It is defined as: scheduled-micro-ops / issue-width + stalls.
1441    /// See getStalls().
1442    unsigned CurrCycle;
1443
1444    /// Micro-ops issued in the current cycle
1445    unsigned CurrMOps;
1446
1447    /// MinReadyCycle - Cycle of the soonest available instruction.
1448    unsigned MinReadyCycle;
1449
1450    // The expected latency of the critical path in this scheduled zone.
1451    unsigned ExpectedLatency;
1452
1453    // The latency of dependence chains leading into this zone.
1454    // For each node scheduled bottom-up: DLat = max(DLat, N.Depth).
1455    // For each cycle scheduled: DLat -= 1.
1456    unsigned DependentLatency;
1457
1458    /// Count the scheduled (issued) micro-ops that can be retired by
1459    /// time=CurrCycle assuming the first scheduled instr is retired at time=0.
1460    unsigned RetiredMOps;
1461
1462    // Count scheduled resources that have been executed. Resources are
1463    // considered executed if they become ready in the time that it takes to
1464    // saturate any resource including the one in question. Counts are scaled
1465    // for direct comparison with other resources. Counts can be compared with
1466    // MOps * getMicroOpFactor and Latency * getLatencyFactor.
1467    SmallVector<unsigned, 16> ExecutedResCounts;
1468
1469    /// Cache the max count for a single resource.
1470    unsigned MaxExecutedResCount;
1471
1472    // Cache the critical resources ID in this scheduled zone.
1473    unsigned ZoneCritResIdx;
1474
1475    // Is the scheduled region resource limited vs. latency limited.
1476    bool IsResourceLimited;
1477
1478#ifndef NDEBUG
1479    // Remember the greatest operand latency as an upper bound on the number of
1480    // times we should retry the pending queue because of a hazard.
1481    unsigned MaxObservedLatency;
1482#endif
1483
1484    void reset() {
1485      // A new HazardRec is created for each DAG and owned by SchedBoundary.
1486      // Destroying and reconstructing it is very expensive though. So keep
1487      // invalid, placeholder HazardRecs.
1488      if (HazardRec && HazardRec->isEnabled()) {
1489        delete HazardRec;
1490        HazardRec = 0;
1491      }
1492      Available.clear();
1493      Pending.clear();
1494      CheckPending = false;
1495      NextSUs.clear();
1496      CurrCycle = 0;
1497      CurrMOps = 0;
1498      MinReadyCycle = UINT_MAX;
1499      ExpectedLatency = 0;
1500      DependentLatency = 0;
1501      RetiredMOps = 0;
1502      MaxExecutedResCount = 0;
1503      ZoneCritResIdx = 0;
1504      IsResourceLimited = false;
1505#ifndef NDEBUG
1506      MaxObservedLatency = 0;
1507#endif
1508      // Reserve a zero-count for invalid CritResIdx.
1509      ExecutedResCounts.resize(1);
1510      assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
1511    }
1512
1513    /// Pending queues extend the ready queues with the same ID and the
1514    /// PendingFlag set.
1515    SchedBoundary(unsigned ID, const Twine &Name):
1516      DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
1517      Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
1518      HazardRec(0) {
1519      reset();
1520    }
1521
1522    ~SchedBoundary() { delete HazardRec; }
1523
1524    void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel,
1525              SchedRemainder *rem);
1526
1527    bool isTop() const {
1528      return Available.getID() == ConvergingScheduler::TopQID;
1529    }
1530
1531#ifndef NDEBUG
1532    const char *getResourceName(unsigned PIdx) {
1533      if (!PIdx)
1534        return "MOps";
1535      return SchedModel->getProcResource(PIdx)->Name;
1536    }
1537#endif
1538
1539    /// Get the number of latency cycles "covered" by the scheduled
1540    /// instructions. This is the larger of the critical path within the zone
1541    /// and the number of cycles required to issue the instructions.
1542    unsigned getScheduledLatency() const {
1543      return std::max(ExpectedLatency, CurrCycle);
1544    }
1545
1546    unsigned getUnscheduledLatency(SUnit *SU) const {
1547      return isTop() ? SU->getHeight() : SU->getDepth();
1548    }
1549
1550    unsigned getResourceCount(unsigned ResIdx) const {
1551      return ExecutedResCounts[ResIdx];
1552    }
1553
1554    /// Get the scaled count of scheduled micro-ops and resources, including
1555    /// executed resources.
1556    unsigned getCriticalCount() const {
1557      if (!ZoneCritResIdx)
1558        return RetiredMOps * SchedModel->getMicroOpFactor();
1559      return getResourceCount(ZoneCritResIdx);
1560    }
1561
1562    /// Get a scaled count for the minimum execution time of the scheduled
1563    /// micro-ops that are ready to execute by getExecutedCount. Notice the
1564    /// feedback loop.
1565    unsigned getExecutedCount() const {
1566      return std::max(CurrCycle * SchedModel->getLatencyFactor(),
1567                      MaxExecutedResCount);
1568    }
1569
1570    bool checkHazard(SUnit *SU);
1571
1572    unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs);
1573
1574    unsigned getOtherResourceCount(unsigned &OtherCritIdx);
1575
1576    void setPolicy(CandPolicy &Policy, SchedBoundary &OtherZone);
1577
1578    void releaseNode(SUnit *SU, unsigned ReadyCycle);
1579
1580    void bumpCycle(unsigned NextCycle);
1581
1582    void incExecutedResources(unsigned PIdx, unsigned Count);
1583
1584    unsigned countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle);
1585
1586    void bumpNode(SUnit *SU);
1587
1588    void releasePending();
1589
1590    void removeReady(SUnit *SU);
1591
1592    SUnit *pickOnlyChoice();
1593
1594#ifndef NDEBUG
1595    void dumpScheduledState();
1596#endif
1597  };
1598
1599private:
1600  const MachineSchedContext *Context;
1601  ScheduleDAGMI *DAG;
1602  const TargetSchedModel *SchedModel;
1603  const TargetRegisterInfo *TRI;
1604
1605  // State of the top and bottom scheduled instruction boundaries.
1606  SchedRemainder Rem;
1607  SchedBoundary Top;
1608  SchedBoundary Bot;
1609
1610  MachineSchedPolicy RegionPolicy;
1611public:
1612  /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
1613  enum {
1614    TopQID = 1,
1615    BotQID = 2,
1616    LogMaxQID = 2
1617  };
1618
1619  ConvergingScheduler(const MachineSchedContext *C):
1620    Context(C), DAG(0), SchedModel(0), TRI(0),
1621    Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
1622
1623  virtual void initPolicy(MachineBasicBlock::iterator Begin,
1624                          MachineBasicBlock::iterator End,
1625                          unsigned NumRegionInstrs);
1626
1627  bool shouldTrackPressure() const { return RegionPolicy.ShouldTrackPressure; }
1628
1629  virtual void initialize(ScheduleDAGMI *dag);
1630
1631  virtual SUnit *pickNode(bool &IsTopNode);
1632
1633  virtual void schedNode(SUnit *SU, bool IsTopNode);
1634
1635  virtual void releaseTopNode(SUnit *SU);
1636
1637  virtual void releaseBottomNode(SUnit *SU);
1638
1639  virtual void registerRoots();
1640
1641protected:
1642  void checkAcyclicLatency();
1643
1644  void tryCandidate(SchedCandidate &Cand,
1645                    SchedCandidate &TryCand,
1646                    SchedBoundary &Zone,
1647                    const RegPressureTracker &RPTracker,
1648                    RegPressureTracker &TempTracker);
1649
1650  SUnit *pickNodeBidirectional(bool &IsTopNode);
1651
1652  void pickNodeFromQueue(SchedBoundary &Zone,
1653                         const RegPressureTracker &RPTracker,
1654                         SchedCandidate &Candidate);
1655
1656  void reschedulePhysRegCopies(SUnit *SU, bool isTop);
1657
1658#ifndef NDEBUG
1659  void traceCandidate(const SchedCandidate &Cand);
1660#endif
1661};
1662} // namespace
1663
1664void ConvergingScheduler::SchedRemainder::
1665init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
1666  reset();
1667  if (!SchedModel->hasInstrSchedModel())
1668    return;
1669  RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
1670  for (std::vector<SUnit>::iterator
1671         I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) {
1672    const MCSchedClassDesc *SC = DAG->getSchedClass(&*I);
1673    RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC)
1674      * SchedModel->getMicroOpFactor();
1675    for (TargetSchedModel::ProcResIter
1676           PI = SchedModel->getWriteProcResBegin(SC),
1677           PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
1678      unsigned PIdx = PI->ProcResourceIdx;
1679      unsigned Factor = SchedModel->getResourceFactor(PIdx);
1680      RemainingCounts[PIdx] += (Factor * PI->Cycles);
1681    }
1682  }
1683}
1684
1685void ConvergingScheduler::SchedBoundary::
1686init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
1687  reset();
1688  DAG = dag;
1689  SchedModel = smodel;
1690  Rem = rem;
1691  if (SchedModel->hasInstrSchedModel())
1692    ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds());
1693}
1694
1695/// Initialize the per-region scheduling policy.
1696void ConvergingScheduler::initPolicy(MachineBasicBlock::iterator Begin,
1697                                     MachineBasicBlock::iterator End,
1698                                     unsigned NumRegionInstrs) {
1699  const TargetMachine &TM = Context->MF->getTarget();
1700
1701  // Avoid setting up the register pressure tracker for small regions to save
1702  // compile time. As a rough heuristic, only track pressure when the number of
1703  // schedulable instructions exceeds half the integer register file.
1704  unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
1705    TM.getTargetLowering()->getRegClassFor(MVT::i32));
1706
1707  RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
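  // Illustrative example with hypothetical numbers: on a target with 32
  // allocatable i32 registers, NIntRegs / 2 == 16, so pressure tracking is
  // only set up for regions of more than 16 schedulable instructions.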
1708
1709  // For generic targets, we default to bottom-up, because it's simpler and more
1710  // of the compile-time optimizations have been implemented for that direction.
1711  RegionPolicy.OnlyBottomUp = true;
1712
1713  // Allow the subtarget to override default policy.
1714  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
1715  ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs);
1716
1717  // After subtarget overrides, apply command line options.
1718  if (!EnableRegPressure)
1719    RegionPolicy.ShouldTrackPressure = false;
1720
1721  // Check -misched-topdown/bottomup can force or unforce scheduling direction.
1722  // e.g. -misched-bottomup=false allows scheduling in both directions.
1723  assert((!ForceTopDown || !ForceBottomUp) &&
1724         "-misched-topdown incompatible with -misched-bottomup");
1725  if (ForceBottomUp.getNumOccurrences() > 0) {
1726    RegionPolicy.OnlyBottomUp = ForceBottomUp;
1727    if (RegionPolicy.OnlyBottomUp)
1728      RegionPolicy.OnlyTopDown = false;
1729  }
1730  if (ForceTopDown.getNumOccurrences() > 0) {
1731    RegionPolicy.OnlyTopDown = ForceTopDown;
1732    if (RegionPolicy.OnlyTopDown)
1733      RegionPolicy.OnlyBottomUp = false;
1734  }
1735}
1736
1737void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
1738  DAG = dag;
1739  SchedModel = DAG->getSchedModel();
1740  TRI = DAG->TRI;
1741
1742  Rem.init(DAG, SchedModel);
1743  Top.init(DAG, SchedModel, &Rem);
1744  Bot.init(DAG, SchedModel, &Rem);
1745
1746  // Initialize resource counts.
1747
1748  // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
1749  // are disabled, then these HazardRecs will be disabled.
1750  const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
1751  const TargetMachine &TM = DAG->MF.getTarget();
1752  if (!Top.HazardRec) {
1753    Top.HazardRec =
1754      TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
1755  }
1756  if (!Bot.HazardRec) {
1757    Bot.HazardRec =
1758      TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
1759  }
1760}
1761
1762void ConvergingScheduler::releaseTopNode(SUnit *SU) {
1763  if (SU->isScheduled)
1764    return;
1765
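  // The loop below computes TopReadyCycle as the max over non-weak
  // predecessors of (PredReadyCycle + edge latency). Purely illustrative
  // example: with two predecessors ready at cycles 3 and 5 and edge latencies
  // of 2 and 1, SU becomes ready at max(3 + 2, 5 + 1) = 6.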
1766  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
1767       I != E; ++I) {
1768    if (I->isWeak())
1769      continue;
1770    unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
1771    unsigned Latency = I->getLatency();
1772#ifndef NDEBUG
1773    Top.MaxObservedLatency = std::max(Latency, Top.MaxObservedLatency);
1774#endif
1775    if (SU->TopReadyCycle < PredReadyCycle + Latency)
1776      SU->TopReadyCycle = PredReadyCycle + Latency;
1777  }
1778  Top.releaseNode(SU, SU->TopReadyCycle);
1779}
1780
1781void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
1782  if (SU->isScheduled)
1783    return;
1784
1785  assert(SU->getInstr() && "Scheduled SUnit must have instr");
1786
1787  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
1788       I != E; ++I) {
1789    if (I->isWeak())
1790      continue;
1791    unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
1792    unsigned Latency = I->getLatency();
1793#ifndef NDEBUG
1794    Bot.MaxObservedLatency = std::max(Latency, Bot.MaxObservedLatency);
1795#endif
1796    if (SU->BotReadyCycle < SuccReadyCycle + Latency)
1797      SU->BotReadyCycle = SuccReadyCycle + Latency;
1798  }
1799  Bot.releaseNode(SU, SU->BotReadyCycle);
1800}
1801
1802/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
1803/// critical path by more cycles than it takes to drain the instruction buffer.
1804/// We estimate an upper bound on in-flight instructions as:
1805///
1806/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
1807/// InFlightIterations = AcyclicPath / CyclesPerIteration
1808/// InFlightResources = InFlightIterations * LoopResources
1809///
1810/// TODO: Check execution resources in addition to IssueCount.
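/// Purely illustrative example, assuming factor-1 scaling for micro-ops and
/// latency: with CyclicCritPath = 4, RemIssueCount = 8, CriticalPath = 40 and
/// MicroOpBufferSize = 16, IterCount = max(4, 8) = 8, AcyclicCount = 40,
/// InFlightCount = ceil(40 * 8 / 8) = 40 > BufferLimit = 16, so the loop is
/// flagged as acyclic-latency limited.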
1811void ConvergingScheduler::checkAcyclicLatency() {
1812  if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
1813    return;
1814
1815  // Scaled number of cycles per loop iteration.
1816  unsigned IterCount =
1817    std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
1818             Rem.RemIssueCount);
1819  // Scaled acyclic critical path.
1820  unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
1821  // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
1822  unsigned InFlightCount =
1823    (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
1824  unsigned BufferLimit =
1825    SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
1826
1827  Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
1828
1829  DEBUG(dbgs() << "IssueCycles="
1830        << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
1831        << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
1832        << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
1833        << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
1834        << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
1835        if (Rem.IsAcyclicLatencyLimited)
1836          dbgs() << "  ACYCLIC LATENCY LIMIT\n");
1837}
1838
1839void ConvergingScheduler::registerRoots() {
1840  Rem.CriticalPath = DAG->ExitSU.getDepth();
1841
1842  // Some roots may not feed into ExitSU. Check all of them just in case.
1843  for (std::vector<SUnit*>::const_iterator
1844         I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
1845    if ((*I)->getDepth() > Rem.CriticalPath)
1846      Rem.CriticalPath = (*I)->getDepth();
1847  }
1848  DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
1849
1850  if (EnableCyclicPath) {
1851    Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
1852    checkAcyclicLatency();
1853  }
1854}
1855
1856/// Does this SU have a hazard within the current instruction group?
1857///
1858/// The scheduler supports two modes of hazard recognition. The first is the
1859/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
1860/// supports highly complicated in-order reservation tables
1861/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
1862///
1863/// The second is a streamlined mechanism that checks for hazards based on
1864/// simple counters that the scheduler itself maintains. It explicitly checks
1865/// for instruction dispatch limitations, including the number of micro-ops that
1866/// can dispatch per cycle.
1867///
1868/// TODO: Also check whether the SU must start a new group.
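/// Illustrative example of the second mechanism, with hypothetical numbers:
/// given an issue width of 4, CurrMOps == 3, and an SU that needs 2 micro-ops,
/// 3 + 2 > 4, so the SU is reported as a hazard and is held in the Pending
/// queue until the cycle advances.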
1869bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
1870  if (HazardRec->isEnabled())
1871    return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
1872
1873  unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
1874  if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
1875    DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") uops="
1876          << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
1877    return true;
1878  }
1879  return false;
1880}
1881
1882// Find the unscheduled node in ReadySUs with the highest latency.
1883unsigned ConvergingScheduler::SchedBoundary::
1884findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
1885  SUnit *LateSU = 0;
1886  unsigned RemLatency = 0;
1887  for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end();
1888       I != E; ++I) {
1889    unsigned L = getUnscheduledLatency(*I);
1890    if (L > RemLatency) {
1891      RemLatency = L;
1892      LateSU = *I;
1893    }
1894  }
1895  if (LateSU) {
1896    DEBUG(dbgs() << Available.getName() << " RemLatency SU("
1897          << LateSU->NodeNum << ") " << RemLatency << "c\n");
1898  }
1899  return RemLatency;
1900}
1901
1902// Count resources in this zone and the remaining unscheduled
1903// instructions. Return the max count, scaled. Set OtherCritIdx to the critical
1904// resource index, or zero if the zone is issue limited.
1905unsigned ConvergingScheduler::SchedBoundary::
1906getOtherResourceCount(unsigned &OtherCritIdx) {
1907  OtherCritIdx = 0;
1908  if (!SchedModel->hasInstrSchedModel())
1909    return 0;
1910
1911  unsigned OtherCritCount = Rem->RemIssueCount
1912    + (RetiredMOps * SchedModel->getMicroOpFactor());
1913  DEBUG(dbgs() << "  " << Available.getName() << " + Remain MOps: "
1914        << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
1915  for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
1916       PIdx != PEnd; ++PIdx) {
1917    unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
1918    if (OtherCount > OtherCritCount) {
1919      OtherCritCount = OtherCount;
1920      OtherCritIdx = PIdx;
1921    }
1922  }
1923  if (OtherCritIdx) {
1924    DEBUG(dbgs() << "  " << Available.getName() << " + Remain CritRes: "
1925          << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
1926          << " " << getResourceName(OtherCritIdx) << "\n");
1927  }
1928  return OtherCritCount;
1929}
1930
1931/// Set the CandPolicy for this zone given the current resources and latencies
1932/// inside and outside the zone.
1933void ConvergingScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
1934                                                   SchedBoundary &OtherZone) {
1935  // Now that potential stalls have been considered, apply preemptive heuristics
1936  // based on the total latency and resources inside and outside this
1937  // zone.
1938
1939  // Compute remaining latency. We need this both to determine whether the
1940  // overall schedule has become latency-limited and whether the instructions
1941  // outside this zone are resource or latency limited.
1942  //
1943  // The "dependent" latency is updated incrementally during scheduling as the
1944  // max height/depth of scheduled nodes minus the cycles since it was
1945  // scheduled:
1946  //   DLat = max(N.depth - (CurrCycle - N.ReadyCycle)) for N in Zone
1947  //
1948  // The "independent" latency is the max ready queue depth:
1949  //   ILat = max N.depth for N in Available|Pending
1950  //
1951  // RemainingLatency is the greater of independent and dependent latency.
1952  unsigned RemLatency = DependentLatency;
1953  RemLatency = std::max(RemLatency, findMaxLatency(Available.elements()));
1954  RemLatency = std::max(RemLatency, findMaxLatency(Pending.elements()));
1955
1956  // Compute the critical resource outside the zone.
1957  unsigned OtherCritIdx;
1958  unsigned OtherCount = OtherZone.getOtherResourceCount(OtherCritIdx);
1959
1960  bool OtherResLimited = false;
1961  if (SchedModel->hasInstrSchedModel()) {
1962    unsigned LFactor = SchedModel->getLatencyFactor();
1963    OtherResLimited = (int)(OtherCount - (RemLatency * LFactor)) > (int)LFactor;
1964  }
1965  if (!OtherResLimited && (RemLatency + CurrCycle > Rem->CriticalPath)) {
1966    Policy.ReduceLatency |= true;
1967    DEBUG(dbgs() << "  " << Available.getName() << " RemainingLatency "
1968          << RemLatency << " + " << CurrCycle << "c > CritPath "
1969          << Rem->CriticalPath << "\n");
1970  }
1971  // If the same resource is limiting inside and outside the zone, do nothing.
1972  if (ZoneCritResIdx == OtherCritIdx)
1973    return;
1974
1975  DEBUG(
1976    if (IsResourceLimited) {
1977      dbgs() << "  " << Available.getName() << " ResourceLimited: "
1978             << getResourceName(ZoneCritResIdx) << "\n";
1979    }
1980    if (OtherResLimited)
1981      dbgs() << "  RemainingLimit: " << getResourceName(OtherCritIdx) << "\n";
1982    if (!IsResourceLimited && !OtherResLimited)
1983      dbgs() << "  Latency limited both directions.\n");
1984
1985  if (IsResourceLimited && !Policy.ReduceResIdx)
1986    Policy.ReduceResIdx = ZoneCritResIdx;
1987
1988  if (OtherResLimited)
1989    Policy.DemandResIdx = OtherCritIdx;
1990}
1991
1992void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
1993                                                     unsigned ReadyCycle) {
1994  if (ReadyCycle < MinReadyCycle)
1995    MinReadyCycle = ReadyCycle;
1996
1997  // Check for interlocks first. For the purpose of other heuristics, an
1998  // instruction that cannot issue appears as if it's not in the ReadyQueue.
1999  bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
2000  if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU))
2001    Pending.push(SU);
2002  else
2003    Available.push(SU);
2004
2005  // Record this node as an immediate dependent of the scheduled node.
2006  NextSUs.insert(SU);
2007}
2008
2009/// Move the boundary of scheduled code by one cycle.
2010void ConvergingScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) {
2011  if (SchedModel->getMicroOpBufferSize() == 0) {
2012    assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
2013    if (MinReadyCycle > NextCycle)
2014      NextCycle = MinReadyCycle;
2015  }
2016  // Update the current micro-ops, which will issue in the next cycle.
2017  unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle);
2018  CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps;
2019
2020  // Decrement DependentLatency based on the next cycle.
2021  if ((NextCycle - CurrCycle) > DependentLatency)
2022    DependentLatency = 0;
2023  else
2024    DependentLatency -= (NextCycle - CurrCycle);
2025
2026  if (!HazardRec->isEnabled()) {
2027    // Bypass HazardRec virtual calls.
2028    CurrCycle = NextCycle;
2029  }
2030  else {
2031    // Bypass getHazardType calls in case of long latency.
2032    for (; CurrCycle != NextCycle; ++CurrCycle) {
2033      if (isTop())
2034        HazardRec->AdvanceCycle();
2035      else
2036        HazardRec->RecedeCycle();
2037    }
2038  }
2039  CheckPending = true;
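  // The zone counts as resource limited when the scaled critical-resource
  // count exceeds the scaled scheduled latency by more than one cycle's worth
  // of work. Illustrative example with hypothetical scaled values:
  // LFactor = 2, getCriticalCount() = 14, getScheduledLatency() = 5 gives
  // 14 - 5*2 = 4 > 2, so IsResourceLimited becomes true.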
2040  unsigned LFactor = SchedModel->getLatencyFactor();
2041  IsResourceLimited =
2042    (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
2043    > (int)LFactor;
2044
2045  DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n');
2046}
2047
2048void ConvergingScheduler::SchedBoundary::incExecutedResources(unsigned PIdx,
2049                                                              unsigned Count) {
2050  ExecutedResCounts[PIdx] += Count;
2051  if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
2052    MaxExecutedResCount = ExecutedResCounts[PIdx];
2053}
2054
2055/// Add the given processor resource to this scheduled zone.
2056///
2057/// \param Cycles indicates the number of consecutive (non-pipelined) cycles
2058/// during which this resource is consumed.
2059///
2060/// \return the next cycle at which the instruction may execute without
2061/// oversubscribing resources.
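/// Illustrative example with hypothetical factors: a resource with Factor = 3
/// consumed for Cycles = 2 adds Count = 6 scaled units to ExecutedResCounts
/// and subtracts the same amount from the remaining counts; if that pushes the
/// resource past getCriticalCount(), it becomes the zone's critical resource.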
2062unsigned ConvergingScheduler::SchedBoundary::
2063countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) {
2064  unsigned Factor = SchedModel->getResourceFactor(PIdx);
2065  unsigned Count = Factor * Cycles;
2066  DEBUG(dbgs() << "  " << getResourceName(PIdx)
2067        << " +" << Cycles << "x" << Factor << "u\n");
2068
2069  // Update Executed resources counts.
2070  incExecutedResources(PIdx, Count);
2071  assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
2072  Rem->RemainingCounts[PIdx] -= Count;
2073
2074  // Check if this resource exceeds the current critical resource. If so, it
2075  // becomes the critical resource.
2076  if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
2077    ZoneCritResIdx = PIdx;
2078    DEBUG(dbgs() << "  *** Critical resource "
2079          << getResourceName(PIdx) << ": "
2080          << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n");
2081  }
2082  // TODO: We don't yet model reserved resources. It's not hard though.
2083  return CurrCycle;
2084}
2085
2086/// Move the boundary of scheduled code by one SUnit.
2087void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
2088  // Update the reservation table.
2089  if (HazardRec->isEnabled()) {
2090    if (!isTop() && SU->isCall) {
2091      // Calls are scheduled with their preceding instructions. For bottom-up
2092      // scheduling, clear the pipeline state before emitting.
2093      HazardRec->Reset();
2094    }
2095    HazardRec->EmitInstruction(SU);
2096  }
2097  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2098  unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
2099  CurrMOps += IncMOps;
2100  // checkHazard prevents scheduling multiple instructions per cycle that exceed
2101  // the issue width. However, we commonly reach the maximum. In this case
2102  // opportunistically bump the cycle to avoid uselessly checking everything in
2103  // the readyQ. Furthermore, a single instruction may produce more than one
2104  // cycle's worth of micro-ops.
2105  //
2106  // TODO: Also check if this SU must end a dispatch group.
2107  unsigned NextCycle = CurrCycle;
2108  if (CurrMOps >= SchedModel->getIssueWidth()) {
2109    ++NextCycle;
2110    DEBUG(dbgs() << "  *** Max MOps " << CurrMOps
2111          << " at cycle " << CurrCycle << '\n');
2112  }
2113  unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
2114  DEBUG(dbgs() << "  Ready @" << ReadyCycle << "c\n");
2115
2116  switch (SchedModel->getMicroOpBufferSize()) {
2117  case 0:
2118    assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
2119    break;
2120  case 1:
2121    if (ReadyCycle > NextCycle) {
2122      NextCycle = ReadyCycle;
2123      DEBUG(dbgs() << "  *** Stall until: " << ReadyCycle << "\n");
2124    }
2125    break;
2126  default:
2127    // We don't currently model the OOO reorder buffer, so consider all
2128    // scheduled MOps to be "retired".
2129    break;
2130  }
2131  RetiredMOps += IncMOps;
2132
2133  // Update resource counts and critical resource.
2134  if (SchedModel->hasInstrSchedModel()) {
2135    unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
2136    assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
2137    Rem->RemIssueCount -= DecRemIssue;
2138    if (ZoneCritResIdx) {
2139      // Scale scheduled micro-ops for comparing with the critical resource.
2140      unsigned ScaledMOps =
2141        RetiredMOps * SchedModel->getMicroOpFactor();
2142
2143      // If scaled micro-ops are now more than the previous critical resource by
2144      // a full cycle, then micro-ops issue becomes critical.
2145      if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
2146          >= (int)SchedModel->getLatencyFactor()) {
2147        ZoneCritResIdx = 0;
2148        DEBUG(dbgs() << "  *** Critical resource NumMicroOps: "
2149              << ScaledMOps / SchedModel->getLatencyFactor() << "c\n");
2150      }
2151    }
2152    for (TargetSchedModel::ProcResIter
2153           PI = SchedModel->getWriteProcResBegin(SC),
2154           PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2155      unsigned RCycle =
2156        countResource(PI->ProcResourceIdx, PI->Cycles, ReadyCycle);
2157      if (RCycle > NextCycle)
2158        NextCycle = RCycle;
2159    }
2160  }
2161  // Update ExpectedLatency and DependentLatency.
2162  unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
2163  unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
2164  if (SU->getDepth() > TopLatency) {
2165    TopLatency = SU->getDepth();
2166    DEBUG(dbgs() << "  " << Available.getName()
2167          << " TopLatency SU(" << SU->NodeNum << ") " << TopLatency << "c\n");
2168  }
2169  if (SU->getHeight() > BotLatency) {
2170    BotLatency = SU->getHeight();
2171    DEBUG(dbgs() << "  " << Available.getName()
2172          << " BotLatency SU(" << SU->NodeNum << ") " << BotLatency << "c\n");
2173  }
2174  // If we stall for any reason, bump the cycle.
2175  if (NextCycle > CurrCycle) {
2176    bumpCycle(NextCycle);
2177  }
2178  else {
2179    // After updating ZoneCritResIdx and ExpectedLatency, check if we're
2180    // resource limited. If a stall occurred, bumpCycle does this.
2181    unsigned LFactor = SchedModel->getLatencyFactor();
2182    IsResourceLimited =
2183      (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
2184      > (int)LFactor;
2185  }
2186  DEBUG(dumpScheduledState());
2187}
2188
2189/// Release pending ready nodes into the available queue. This makes them
2190/// visible to heuristics.
2191void ConvergingScheduler::SchedBoundary::releasePending() {
2192  // If the available queue is empty, it is safe to reset MinReadyCycle.
2193  if (Available.empty())
2194    MinReadyCycle = UINT_MAX;
2195
2196  // Check to see if any of the pending instructions are ready to issue.  If
2197  // so, add them to the available queue.
2198  bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
2199  for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
2200    SUnit *SU = *(Pending.begin()+i);
2201    unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
2202
2203    if (ReadyCycle < MinReadyCycle)
2204      MinReadyCycle = ReadyCycle;
2205
2206    if (!IsBuffered && ReadyCycle > CurrCycle)
2207      continue;
2208
2209    if (checkHazard(SU))
2210      continue;
2211
2212    Available.push(SU);
2213    Pending.remove(Pending.begin()+i);
2214    --i; --e;
2215  }
2216  DEBUG(if (!Pending.empty()) Pending.dump());
2217  CheckPending = false;
2218}
2219
2220/// Remove SU from the ready set for this boundary.
2221void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
2222  if (Available.isInQueue(SU))
2223    Available.remove(Available.find(SU));
2224  else {
2225    assert(Pending.isInQueue(SU) && "bad ready count");
2226    Pending.remove(Pending.find(SU));
2227  }
2228}
2229
2230/// If this queue only has one ready candidate, return it. As a side effect,
2231/// defer any nodes that now hit a hazard, and advance the cycle until at least
2232/// one node is ready. If multiple instructions are ready, return NULL.
2233SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
2234  if (CheckPending)
2235    releasePending();
2236
2237  if (CurrMOps > 0) {
2238    // Defer any ready instrs that now have a hazard.
2239    for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
2240      if (checkHazard(*I)) {
2241        Pending.push(*I);
2242        I = Available.remove(I);
2243        continue;
2244      }
2245      ++I;
2246    }
2247  }
2248  for (unsigned i = 0; Available.empty(); ++i) {
2249    assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) &&
2250           "permanent hazard"); (void)i;
2251    bumpCycle(CurrCycle + 1);
2252    releasePending();
2253  }
2254  if (Available.size() == 1)
2255    return *Available.begin();
2256  return NULL;
2257}
2258
2259#ifndef NDEBUG
2260// This is useful information to dump after bumpNode.
2261// Note that the Queue contents are more useful before pickNodeFromQueue.
2262void ConvergingScheduler::SchedBoundary::dumpScheduledState() {
2263  unsigned ResFactor;
2264  unsigned ResCount;
2265  if (ZoneCritResIdx) {
2266    ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
2267    ResCount = getResourceCount(ZoneCritResIdx);
2268  }
2269  else {
2270    ResFactor = SchedModel->getMicroOpFactor();
2271    ResCount = RetiredMOps * SchedModel->getMicroOpFactor();
2272  }
2273  unsigned LFactor = SchedModel->getLatencyFactor();
2274  dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
2275         << "  Retired: " << RetiredMOps;
2276  dbgs() << "\n  Executed: " << getExecutedCount() / LFactor << "c";
2277  dbgs() << "\n  Critical: " << ResCount / LFactor << "c, "
2278         << ResCount / ResFactor << " " << getResourceName(ZoneCritResIdx)
2279         << "\n  ExpectedLatency: " << ExpectedLatency << "c\n"
2280         << (IsResourceLimited ? "  - Resource" : "  - Latency")
2281         << " limited.\n";
2282}
2283#endif
2284
2285void ConvergingScheduler::SchedCandidate::
2286initResourceDelta(const ScheduleDAGMI *DAG,
2287                  const TargetSchedModel *SchedModel) {
2288  if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
2289    return;
2290
2291  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2292  for (TargetSchedModel::ProcResIter
2293         PI = SchedModel->getWriteProcResBegin(SC),
2294         PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2295    if (PI->ProcResourceIdx == Policy.ReduceResIdx)
2296      ResDelta.CritResources += PI->Cycles;
2297    if (PI->ProcResourceIdx == Policy.DemandResIdx)
2298      ResDelta.DemandedResources += PI->Cycles;
2299  }
2300}
2301
2302
2303/// Return true if this heuristic determines order.
2304static bool tryLess(int TryVal, int CandVal,
2305                    ConvergingScheduler::SchedCandidate &TryCand,
2306                    ConvergingScheduler::SchedCandidate &Cand,
2307                    ConvergingScheduler::CandReason Reason) {
2308  if (TryVal < CandVal) {
2309    TryCand.Reason = Reason;
2310    return true;
2311  }
2312  if (TryVal > CandVal) {
2313    if (Cand.Reason > Reason)
2314      Cand.Reason = Reason;
2315    return true;
2316  }
2317  Cand.setRepeat(Reason);
2318  return false;
2319}
2320
2321static bool tryGreater(int TryVal, int CandVal,
2322                       ConvergingScheduler::SchedCandidate &TryCand,
2323                       ConvergingScheduler::SchedCandidate &Cand,
2324                       ConvergingScheduler::CandReason Reason) {
2325  if (TryVal > CandVal) {
2326    TryCand.Reason = Reason;
2327    return true;
2328  }
2329  if (TryVal < CandVal) {
2330    if (Cand.Reason > Reason)
2331      Cand.Reason = Reason;
2332    return true;
2333  }
2334  Cand.setRepeat(Reason);
2335  return false;
2336}
2337
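// Compare the register-pressure impact of two candidates. Illustration of the
// same-set case below: if both candidates increase the same pressure set, say
// by +1 and +3 units, tryLess prefers the candidate with the +1 increase.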
2338static bool tryPressure(const PressureChange &TryP,
2339                        const PressureChange &CandP,
2340                        ConvergingScheduler::SchedCandidate &TryCand,
2341                        ConvergingScheduler::SchedCandidate &Cand,
2342                        ConvergingScheduler::CandReason Reason) {
2343  int TryRank = TryP.getPSetOrMax();
2344  int CandRank = CandP.getPSetOrMax();
2345  // If both candidates affect the same set, go with the smallest increase.
2346  if (TryRank == CandRank) {
2347    return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
2348                   Reason);
2349  }
2350  // If one candidate decreases and the other increases, go with it.
2351  // Invalid candidates have UnitInc==0.
2352  if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
2353              Reason)) {
2354    return true;
2355  }
2356  // If the candidates are decreasing pressure, reverse priority.
2357  if (TryP.getUnitInc() < 0)
2358    std::swap(TryRank, CandRank);
2359  return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
2360}
2361
2362static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
2363  return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
2364}
2365
2366/// Minimize physical register live ranges. Regalloc wants them adjacent to
2367/// their physreg def/use.
2368///
2369/// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
2370/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
2371/// with the operation that produces or consumes the physreg. We'll do this when
2372/// regalloc has support for parallel copies.
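/// Illustrative example: when scheduling bottom-up, a copy such as
/// "%EAX = COPY %vreg" whose physreg result feeds an already-scheduled
/// instruction returns +1, biasing the scheduler to place the copy
/// immediately and keep the physreg live range short.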
2373static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
2374  const MachineInstr *MI = SU->getInstr();
2375  if (!MI->isCopy())
2376    return 0;
2377
2378  unsigned ScheduledOper = isTop ? 1 : 0;
2379  unsigned UnscheduledOper = isTop ? 0 : 1;
2380  // If we have already scheduled the physreg producer/consumer, immediately
2381  // schedule the copy.
2382  if (TargetRegisterInfo::isPhysicalRegister(
2383        MI->getOperand(ScheduledOper).getReg()))
2384    return 1;
2385  // If the physreg is at the boundary, defer it. Otherwise schedule it
2386  // immediately to free the dependent. We can hoist the copy later.
2387  bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
2388  if (TargetRegisterInfo::isPhysicalRegister(
2389        MI->getOperand(UnscheduledOper).getReg()))
2390    return AtBoundary ? -1 : 1;
2391  return 0;
2392}
2393
2394static bool tryLatency(ConvergingScheduler::SchedCandidate &TryCand,
2395                       ConvergingScheduler::SchedCandidate &Cand,
2396                       ConvergingScheduler::SchedBoundary &Zone) {
2397  if (Zone.isTop()) {
2398    if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
2399      if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2400                  TryCand, Cand, ConvergingScheduler::TopDepthReduce))
2401        return true;
2402    }
2403    if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2404                   TryCand, Cand, ConvergingScheduler::TopPathReduce))
2405      return true;
2406  }
2407  else {
2408    if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
2409      if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2410                  TryCand, Cand, ConvergingScheduler::BotHeightReduce))
2411        return true;
2412    }
2413    if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2414                   TryCand, Cand, ConvergingScheduler::BotPathReduce))
2415      return true;
2416  }
2417  return false;
2418}
2419
2420/// Apply a set of heuristics to a new candidate. Heuristics are currently
2421/// hierarchical. This may be more efficient than a graduated cost model because
2422/// we don't need to evaluate all aspects of the model for each node in the
2423/// queue. But it's really done to make the heuristics easier to debug and
2424/// statistically analyze.
2425///
2426/// \param Cand provides the policy and current best candidate.
2427/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
2428/// \param Zone describes the scheduled zone that we are extending.
2429/// \param RPTracker describes reg pressure within the scheduled zone.
2430/// \param TempTracker is a scratch pressure tracker to reuse in queries.
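///
/// In this implementation the checks below are applied roughly in this order:
/// physreg-copy bias, excess pressure, critical pressure, acyclic latency (for
/// latency-limited loops), clustering, weak edges, max pressure, critical
/// resource reduction, demanded resources, latency, next def/use, and finally
/// original node order.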
2431void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
2432                                       SchedCandidate &TryCand,
2433                                       SchedBoundary &Zone,
2434                                       const RegPressureTracker &RPTracker,
2435                                       RegPressureTracker &TempTracker) {
2436
2437  if (DAG->isTrackingPressure()) {
2438    // Always initialize TryCand's RPDelta.
2439    if (Zone.isTop()) {
2440      TempTracker.getMaxDownwardPressureDelta(
2441        TryCand.SU->getInstr(),
2442        TryCand.RPDelta,
2443        DAG->getRegionCriticalPSets(),
2444        DAG->getRegPressure().MaxSetPressure);
2445    }
2446    else {
2447      if (VerifyScheduling) {
2448        TempTracker.getMaxUpwardPressureDelta(
2449          TryCand.SU->getInstr(),
2450          &DAG->getPressureDiff(TryCand.SU),
2451          TryCand.RPDelta,
2452          DAG->getRegionCriticalPSets(),
2453          DAG->getRegPressure().MaxSetPressure);
2454      }
2455      else {
2456        RPTracker.getUpwardPressureDelta(
2457          TryCand.SU->getInstr(),
2458          DAG->getPressureDiff(TryCand.SU),
2459          TryCand.RPDelta,
2460          DAG->getRegionCriticalPSets(),
2461          DAG->getRegPressure().MaxSetPressure);
2462      }
2463    }
2464  }
2465  DEBUG(if (TryCand.RPDelta.Excess.isValid())
2466          dbgs() << "  SU(" << TryCand.SU->NodeNum << ") "
2467                 << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet())
2468                 << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n");
2469
2470  // Initialize the candidate if needed.
2471  if (!Cand.isValid()) {
2472    TryCand.Reason = NodeOrder;
2473    return;
2474  }
2475
2476  if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()),
2477                 biasPhysRegCopy(Cand.SU, Zone.isTop()),
2478                 TryCand, Cand, PhysRegCopy))
2479    return;
2480
2481  // Avoid exceeding the target's limit. If signed PSetID is negative, it is
2482  // invalid; convert it to INT_MAX to give it lowest priority.
2483  if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
2484                                               Cand.RPDelta.Excess,
2485                                               TryCand, Cand, RegExcess))
2486    return;
2487
2488  // Avoid increasing the max critical pressure in the scheduled region.
2489  if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
2490                                               Cand.RPDelta.CriticalMax,
2491                                               TryCand, Cand, RegCritical))
2492    return;
2493
2494  // For loops that are acyclic path limited, aggressively schedule for latency.
2495  // This can result in very long dependence chains scheduled in sequence, so
2496  // once every cycle (when CurrMOps == 0), switch to normal heuristics.
2497  if (Rem.IsAcyclicLatencyLimited && !Zone.CurrMOps
2498      && tryLatency(TryCand, Cand, Zone))
2499    return;
2500
2501  // Keep clustered nodes together to encourage downstream peephole
2502  // optimizations which may reduce resource requirements.
2503  //
2504  // This is a best effort to set things up for a post-RA pass. Optimizations
2505  // like generating loads of multiple registers should ideally be done within
2506  // the scheduler pass by combining the loads during DAG postprocessing.
2507  const SUnit *NextClusterSU =
2508    Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
2509  if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
2510                 TryCand, Cand, Cluster))
2511    return;
2512
2513  // Weak edges are for clustering and other constraints.
2514  if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
2515              getWeakLeft(Cand.SU, Zone.isTop()),
2516              TryCand, Cand, Weak)) {
2517    return;
2518  }
2519  // Avoid increasing the max pressure of the entire region.
2520  if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
2521                                               Cand.RPDelta.CurrentMax,
2522                                               TryCand, Cand, RegMax))
2523    return;
2524
2525  // Avoid critical resource consumption and balance the schedule.
2526  TryCand.initResourceDelta(DAG, SchedModel);
2527  if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
2528              TryCand, Cand, ResourceReduce))
2529    return;
2530  if (tryGreater(TryCand.ResDelta.DemandedResources,
2531                 Cand.ResDelta.DemandedResources,
2532                 TryCand, Cand, ResourceDemand))
2533    return;
2534
2535  // Avoid serializing long latency dependence chains.
2536  // For acyclic path limited loops, latency was already checked above.
2537  if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
2538      && tryLatency(TryCand, Cand, Zone)) {
2539    return;
2540  }
2541
2542  // Prefer immediate defs/users of the last scheduled instruction. This is a
2543  // local pressure avoidance strategy that also makes the machine code
2544  // readable.
2545  if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU),
2546                 TryCand, Cand, NextDefUse))
2547    return;
2548
2549  // Fall through to original instruction order.
2550  if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
2551      || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
2552    TryCand.Reason = NodeOrder;
2553  }
2554}
2555
2556#ifndef NDEBUG
2557const char *ConvergingScheduler::getReasonStr(
2558  ConvergingScheduler::CandReason Reason) {
2559  switch (Reason) {
2560  case NoCand:         return "NOCAND    ";
2561  case PhysRegCopy:    return "PREG-COPY";
2562  case RegExcess:      return "REG-EXCESS";
2563  case RegCritical:    return "REG-CRIT  ";
2564  case Cluster:        return "CLUSTER   ";
2565  case Weak:           return "WEAK      ";
2566  case RegMax:         return "REG-MAX   ";
2567  case ResourceReduce: return "RES-REDUCE";
2568  case ResourceDemand: return "RES-DEMAND";
2569  case TopDepthReduce: return "TOP-DEPTH ";
2570  case TopPathReduce:  return "TOP-PATH  ";
2571  case BotHeightReduce:return "BOT-HEIGHT";
2572  case BotPathReduce:  return "BOT-PATH  ";
2573  case NextDefUse:     return "DEF-USE   ";
2574  case NodeOrder:      return "ORDER     ";
2575  };
2576  llvm_unreachable("Unknown reason!");
2577}
2578
2579void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
2580  PressureChange P;
2581  unsigned ResIdx = 0;
2582  unsigned Latency = 0;
2583  switch (Cand.Reason) {
2584  default:
2585    break;
2586  case RegExcess:
2587    P = Cand.RPDelta.Excess;
2588    break;
2589  case RegCritical:
2590    P = Cand.RPDelta.CriticalMax;
2591    break;
2592  case RegMax:
2593    P = Cand.RPDelta.CurrentMax;
2594    break;
2595  case ResourceReduce:
2596    ResIdx = Cand.Policy.ReduceResIdx;
2597    break;
2598  case ResourceDemand:
2599    ResIdx = Cand.Policy.DemandResIdx;
2600    break;
2601  case TopDepthReduce:
2602    Latency = Cand.SU->getDepth();
2603    break;
2604  case TopPathReduce:
2605    Latency = Cand.SU->getHeight();
2606    break;
2607  case BotHeightReduce:
2608    Latency = Cand.SU->getHeight();
2609    break;
2610  case BotPathReduce:
2611    Latency = Cand.SU->getDepth();
2612    break;
2613  }
2614  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
2615  if (P.isValid())
2616    dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
2617           << ":" << P.getUnitInc() << " ";
2618  else
2619    dbgs() << "      ";
2620  if (ResIdx)
2621    dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
2622  else
2623    dbgs() << "         ";
2624  if (Latency)
2625    dbgs() << " " << Latency << " cycles ";
2626  else
2627    dbgs() << "          ";
2628  dbgs() << '\n';
2629}
2630#endif
2631
2632/// Pick the best candidate from the queue.
2633///
2634/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
2635/// DAG building. To adjust for the current scheduling location we need to
2636/// maintain the number of vreg uses remaining to be top-scheduled.
2637void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
2638                                            const RegPressureTracker &RPTracker,
2639                                            SchedCandidate &Cand) {
2640  ReadyQueue &Q = Zone.Available;
2641
2642  DEBUG(Q.dump());
2643
2644  // getMaxPressureDelta temporarily modifies the tracker.
2645  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
2646
2647  for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
2648
2649    SchedCandidate TryCand(Cand.Policy);
2650    TryCand.SU = *I;
2651    tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker);
2652    if (TryCand.Reason != NoCand) {
2653      // Initialize resource delta if needed in case future heuristics query it.
2654      if (TryCand.ResDelta == SchedResourceDelta())
2655        TryCand.initResourceDelta(DAG, SchedModel);
2656      Cand.setBest(TryCand);
2657      DEBUG(traceCandidate(Cand));
2658    }
2659  }
2660}
2661
2662static void tracePick(const ConvergingScheduler::SchedCandidate &Cand,
2663                      bool IsTop) {
2664  DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
2665        << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n');
2666}
2667
2668/// Pick the best candidate node from either the top or bottom queue.
2669SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
2670  // Schedule as far as possible in the direction of no choice. This is most
2671  // efficient, but also provides the best heuristics for CriticalPSets.
2672  if (SUnit *SU = Bot.pickOnlyChoice()) {
2673    IsTopNode = false;
2674    DEBUG(dbgs() << "Pick Bot NOCAND\n");
2675    return SU;
2676  }
2677  if (SUnit *SU = Top.pickOnlyChoice()) {
2678    IsTopNode = true;
2679    DEBUG(dbgs() << "Pick Top NOCAND\n");
2680    return SU;
2681  }
2682  CandPolicy NoPolicy;
2683  SchedCandidate BotCand(NoPolicy);
2684  SchedCandidate TopCand(NoPolicy);
2685  Bot.setPolicy(BotCand.Policy, Top);
2686  Top.setPolicy(TopCand.Policy, Bot);
2687
2688  // Prefer bottom scheduling when heuristics are silent.
2689  pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
2690  assert(BotCand.Reason != NoCand && "failed to find the first candidate");
2691
2692  // If either Q has a single candidate that provides the least increase in
2693  // Excess pressure, we can immediately schedule from that Q.
2694  //
2695  // RegionCriticalPSets summarizes the pressure within the scheduled region and
2696  // affects picking from either Q. If scheduling in one direction must
2697  // increase pressure for one of the excess PSets, then schedule in that
2698  // direction first to provide more freedom in the other direction.
2699  if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess))
2700      || (BotCand.Reason == RegCritical
2701          && !BotCand.isRepeat(RegCritical)))
2702  {
2703    IsTopNode = false;
2704    tracePick(BotCand, IsTopNode);
2705    return BotCand.SU;
2706  }
2707  // Check if the top Q has a better candidate.
2708  pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
2709  assert(TopCand.Reason != NoCand && "failed to find the first candidate");
2710
2711  // Choose the queue with the most important (lowest enum) reason.
2712  if (TopCand.Reason < BotCand.Reason) {
2713    IsTopNode = true;
2714    tracePick(TopCand, IsTopNode);
2715    return TopCand.SU;
2716  }
2717  // Otherwise prefer the bottom candidate, in node order if all else failed.
2718  IsTopNode = false;
2719  tracePick(BotCand, IsTopNode);
2720  return BotCand.SU;
2721}
2722
2723/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
2724SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
2725  if (DAG->top() == DAG->bottom()) {
2726    assert(Top.Available.empty() && Top.Pending.empty() &&
2727           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
2728    return NULL;
2729  }
2730  SUnit *SU;
2731  do {
2732    if (RegionPolicy.OnlyTopDown) {
2733      SU = Top.pickOnlyChoice();
2734      if (!SU) {
2735        CandPolicy NoPolicy;
2736        SchedCandidate TopCand(NoPolicy);
2737        pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
2738        assert(TopCand.Reason != NoCand && "failed to find a candidate");
2739        tracePick(TopCand, true);
2740        SU = TopCand.SU;
2741      }
2742      IsTopNode = true;
2743    }
2744    else if (RegionPolicy.OnlyBottomUp) {
2745      SU = Bot.pickOnlyChoice();
2746      if (!SU) {
2747        CandPolicy NoPolicy;
2748        SchedCandidate BotCand(NoPolicy);
2749        pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
2750        assert(BotCand.Reason != NoCand && "failed to find a candidate");
2751        tracePick(BotCand, false);
2752        SU = BotCand.SU;
2753      }
2754      IsTopNode = false;
2755    }
2756    else {
2757      SU = pickNodeBidirectional(IsTopNode);
2758    }
2759  } while (SU->isScheduled);
2760
2761  if (SU->isTopReady())
2762    Top.removeReady(SU);
2763  if (SU->isBottomReady())
2764    Bot.removeReady(SU);
2765
2766  DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
2767  return SU;
2768}
2769
2770void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
2771
2772  MachineBasicBlock::iterator InsertPos = SU->getInstr();
2773  if (!isTop)
2774    ++InsertPos;
2775  SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs;
2776
2777  // Find already scheduled copies with a single physreg dependence and move
2778  // them just above the scheduled instruction.
2779  for (SmallVectorImpl<SDep>::iterator I = Deps.begin(), E = Deps.end();
2780       I != E; ++I) {
2781    if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg()))
2782      continue;
2783    SUnit *DepSU = I->getSUnit();
2784    if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
2785      continue;
2786    MachineInstr *Copy = DepSU->getInstr();
2787    if (!Copy->isCopy())
2788      continue;
2789    DEBUG(dbgs() << "  Rescheduling physreg copy ";
2790          I->getSUnit()->dump(DAG));
2791    DAG->moveInstruction(Copy, InsertPos);
2792  }
2793}
2794
2795/// Update the scheduler's state after scheduling a node. This is the same node
2796/// that was just returned by pickNode(). However, ScheduleDAGMI needs to update
2797/// its state based on the current cycle before MachineSchedStrategy does.
2798///
2799/// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
2800/// them here. See comments in biasPhysRegCopy.
2801void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
2802  if (IsTopNode) {
2803    SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.CurrCycle);
2804    Top.bumpNode(SU);
2805    if (SU->hasPhysRegUses)
2806      reschedulePhysRegCopies(SU, true);
2807  }
2808  else {
2809    SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.CurrCycle);
2810    Bot.bumpNode(SU);
2811    if (SU->hasPhysRegDefs)
2812      reschedulePhysRegCopies(SU, false);
2813  }
2814}
2815
2816/// Create the standard converging machine scheduler. This will be used as the
2817/// default scheduler if the target does not set a default.
2818static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
2819  ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler(C));
2820  // Register DAG post-processors.
2821  //
2822  // FIXME: extend the mutation API to allow earlier mutations to instantiate
2823  // data and pass it to later mutations. Have a single mutation that gathers
2824  // the interesting nodes in one pass.
2825  DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI));
2826  if (EnableLoadCluster && DAG->TII->enableClusterLoads())
2827    DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
2828  if (EnableMacroFusion)
2829    DAG->addMutation(new MacroFusion(DAG->TII));
2830  return DAG;
2831}
2832static MachineSchedRegistry
2833ConvergingSchedRegistry("converge", "Standard converging scheduler.",
2834                        createConvergingSched);
2835
2836//===----------------------------------------------------------------------===//
2837// ILP Scheduler. Currently for experimental analysis of heuristics.
2838//===----------------------------------------------------------------------===//
2839
2840namespace {
2841/// \brief Order nodes by the ILP metric.
2842struct ILPOrder {
2843  const SchedDFSResult *DFSResult;
2844  const BitVector *ScheduledTrees;
2845  bool MaximizeILP;
2846
2847  ILPOrder(bool MaxILP): DFSResult(0), ScheduledTrees(0), MaximizeILP(MaxILP) {}
2848
2849  /// \brief Apply a less-than relation on node priority.
2850  ///
2851  /// (Return true if A comes after B in the Q.)
2852  bool operator()(const SUnit *A, const SUnit *B) const {
2853    unsigned SchedTreeA = DFSResult->getSubtreeID(A);
2854    unsigned SchedTreeB = DFSResult->getSubtreeID(B);
2855    if (SchedTreeA != SchedTreeB) {
2856      // Unscheduled trees have lower priority.
2857      if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB))
2858        return ScheduledTrees->test(SchedTreeB);
2859
2860      // Trees with shallower connections have lower priority.
2861      if (DFSResult->getSubtreeLevel(SchedTreeA)
2862          != DFSResult->getSubtreeLevel(SchedTreeB)) {
2863        return DFSResult->getSubtreeLevel(SchedTreeA)
2864          < DFSResult->getSubtreeLevel(SchedTreeB);
2865      }
2866    }
2867    if (MaximizeILP)
2868      return DFSResult->getILP(A) < DFSResult->getILP(B);
2869    else
2870      return DFSResult->getILP(A) > DFSResult->getILP(B);
2871  }
2872};
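// Worked ordering example (assumed values): a node in an already-scheduled
// subtree is popped before one in an unscheduled subtree; if both subtrees are
// in the same state, the node in the deeper (higher-level) subtree wins; when
// the subtree tests tie, MaximizeILP pops the larger ILP value first and
// !MaximizeILP pops the smaller one.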
2873
2874/// \brief Schedule based on the ILP metric.
2875class ILPScheduler : public MachineSchedStrategy {
2876  ScheduleDAGMI *DAG;
2877  ILPOrder Cmp;
2878
2879  std::vector<SUnit*> ReadyQ;
2880public:
2881  ILPScheduler(bool MaximizeILP): DAG(0), Cmp(MaximizeILP) {}
2882
2883  virtual void initialize(ScheduleDAGMI *dag) {
2884    DAG = dag;
2885    DAG->computeDFSResult();
2886    Cmp.DFSResult = DAG->getDFSResult();
2887    Cmp.ScheduledTrees = &DAG->getScheduledTrees();
2888    ReadyQ.clear();
2889  }
2890
2891  virtual void registerRoots() {
2892    // Restore the heap in ReadyQ with the updated DFS results.
2893    std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
2894  }
2895
2896  /// Implement MachineSchedStrategy interface.
2897  /// -----------------------------------------
2898
2899  /// Callback to select the highest priority node from the ready Q.
2900  virtual SUnit *pickNode(bool &IsTopNode) {
2901    if (ReadyQ.empty()) return NULL;
2902    std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
2903    SUnit *SU = ReadyQ.back();
2904    ReadyQ.pop_back();
2905    IsTopNode = false;
2906    DEBUG(dbgs() << "Pick node " << "SU(" << SU->NodeNum << ") "
2907          << " ILP: " << DAG->getDFSResult()->getILP(SU)
2908          << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) << " @"
2909          << DAG->getDFSResult()->getSubtreeLevel(
2910            DAG->getDFSResult()->getSubtreeID(SU)) << '\n'
2911          << "Scheduling " << *SU->getInstr());
2912    return SU;
2913  }
2914
2915  /// \brief Scheduler callback to notify that a new subtree is scheduled.
2916  virtual void scheduleTree(unsigned SubtreeID) {
2917    std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
2918  }
2919
2920  /// Callback after a node is scheduled. Marking the newly scheduled tree and
2921  /// resorting the priority Q happen in scheduleTree(); here we only assert order.
2922  virtual void schedNode(SUnit *SU, bool IsTopNode) {
2923    assert(!IsTopNode && "SchedDFSResult needs bottom-up");
2924  }
2925
2926  virtual void releaseTopNode(SUnit *) { /*only called for top roots*/ }
2927
2928  virtual void releaseBottomNode(SUnit *SU) {
2929    ReadyQ.push_back(SU);
2930    std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
2931  }
2932};
2933} // namespace
2934
2935static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
2936  return new ScheduleDAGMI(C, new ILPScheduler(true));
2937}
2938static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
2939  return new ScheduleDAGMI(C, new ILPScheduler(false));
2940}
2941static MachineSchedRegistry ILPMaxRegistry(
2942  "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
2943static MachineSchedRegistry ILPMinRegistry(
2944  "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler);
2945
2946//===----------------------------------------------------------------------===//
2947// Machine Instruction Shuffler for Correctness Testing
2948//===----------------------------------------------------------------------===//
2949
2950#ifndef NDEBUG
2951namespace {
2952/// Apply a less-than relation on the node order, which corresponds to the
2953/// instruction order prior to scheduling. IsReverse implements greater-than.
2954template<bool IsReverse>
2955struct SUnitOrder {
2956  bool operator()(SUnit *A, SUnit *B) const {
2957    if (IsReverse)
2958      return A->NodeNum > B->NodeNum;
2959    else
2960      return A->NodeNum < B->NodeNum;
2961  }
2962};
2963
2964/// Reorder instructions as much as possible.
2965class InstructionShuffler : public MachineSchedStrategy {
2966  bool IsAlternating;
2967  bool IsTopDown;
2968
2969  // Using a less-than relation (SUnitOrder<false>) for the TopQ priority
2970  // gives nodes with a higher number higher priority, causing the latest
2971  // instructions to be scheduled first.
2972  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> >
2973    TopQ;
2974  // When scheduling bottom-up, use greater-than as the queue priority.
2975  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> >
2976    BottomQ;
2977public:
2978  InstructionShuffler(bool alternate, bool topdown)
2979    : IsAlternating(alternate), IsTopDown(topdown) {}
2980
2981  virtual void initialize(ScheduleDAGMI *) {
2982    TopQ.clear();
2983    BottomQ.clear();
2984  }
2985
2986  /// Implement MachineSchedStrategy interface.
2987  /// -----------------------------------------
2988
2989  virtual SUnit *pickNode(bool &IsTopNode) {
2990    SUnit *SU;
2991    if (IsTopDown) {
2992      do {
2993        if (TopQ.empty()) return NULL;
2994        SU = TopQ.top();
2995        TopQ.pop();
2996      } while (SU->isScheduled);
2997      IsTopNode = true;
2998    }
2999    else {
3000      do {
3001        if (BottomQ.empty()) return NULL;
3002        SU = BottomQ.top();
3003        BottomQ.pop();
3004      } while (SU->isScheduled);
3005      IsTopNode = false;
3006    }
3007    if (IsAlternating)
3008      IsTopDown = !IsTopDown;
3009    return SU;
3010  }
3011
3012  virtual void schedNode(SUnit *SU, bool IsTopNode) {}
3013
3014  virtual void releaseTopNode(SUnit *SU) {
3015    TopQ.push(SU);
3016  }
3017  virtual void releaseBottomNode(SUnit *SU) {
3018    BottomQ.push(SU);
3019  }
3020};
3021} // namespace
3022
3023static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
3024  bool Alternate = !ForceTopDown && !ForceBottomUp;
3025  bool TopDown = !ForceBottomUp;
3026  assert((TopDown || !ForceTopDown) &&
3027         "-misched-topdown incompatible with -misched-bottomup");
3028  return new ScheduleDAGMI(C, new InstructionShuffler(Alternate, TopDown));
3029}
3030static MachineSchedRegistry ShufflerRegistry(
3031  "shuffle", "Shuffle machine instructions alternating directions",
3032  createInstructionShuffler);
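// Usage sketch (assert builds only, assuming this revision's llc flags):
//   llc -enable-misched -misched=shuffle ...
// shuffles each region, alternating directions unless -misched-topdown or
// -misched-bottomup forces a single direction.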
3033#endif // !NDEBUG
3034
3035//===----------------------------------------------------------------------===//
3036// GraphWriter support for ScheduleDAGMI.
3037//===----------------------------------------------------------------------===//
3038
3039#ifndef NDEBUG
3040namespace llvm {
3041
3042template<> struct GraphTraits<
3043  ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
3044
3045template<>
3046struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
3047
3048  DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
3049
3050  static std::string getGraphName(const ScheduleDAG *G) {
3051    return G->MF.getName();
3052  }
3053
3054  static bool renderGraphFromBottomUp() {
3055    return true;
3056  }
3057
3058  static bool isNodeHidden(const SUnit *Node) {
3059    return (Node->Preds.size() > 10 || Node->Succs.size() > 10);
3060  }
3061
3062  static bool hasNodeAddressLabel(const SUnit *Node,
3063                                  const ScheduleDAG *Graph) {
3064    return false;
3065  }
3066
3067  /// If you want to override the dot attributes printed for a particular
3068  /// edge, override this method.
3069  static std::string getEdgeAttributes(const SUnit *Node,
3070                                       SUnitIterator EI,
3071                                       const ScheduleDAG *Graph) {
3072    if (EI.isArtificialDep())
3073      return "color=cyan,style=dashed";
3074    if (EI.isCtrlDep())
3075      return "color=blue,style=dashed";
3076    return "";
3077  }
3078
3079  static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
3080    std::string Str;
3081    raw_string_ostream SS(Str);
3082    const SchedDFSResult *DFS =
3083      static_cast<const ScheduleDAGMI*>(G)->getDFSResult();
3084    SS << "SU:" << SU->NodeNum;
3085    if (DFS)
3086      SS << " I:" << DFS->getNumInstrs(SU);
3087    return SS.str();
3088  }
3089  static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
3090    return G->getGraphNodeLabel(SU);
3091  }
3092
3093  static std::string getNodeAttributes(const SUnit *N,
3094                                       const ScheduleDAG *Graph) {
3095    std::string Str("shape=Mrecord");
3096    const SchedDFSResult *DFS =
3097      static_cast<const ScheduleDAGMI*>(Graph)->getDFSResult();
3098    if (DFS) {
3099      Str += ",style=filled,fillcolor=\"#";
3100      Str += DOT::getColorString(DFS->getSubtreeID(N));
3101      Str += '"';
3102    }
3103    return Str;
3104  }
3105};
3106} // namespace llvm
3107#endif // NDEBUG
3108
3109/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
3110/// rendered using 'dot'.
3111///
3112void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) {
3113#ifndef NDEBUG
3114  ViewGraph(this, Name, false, Title);
3115#else
3116  errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on "
3117         << "systems with Graphviz or gv!\n";
3118#endif  // NDEBUG
3119}
3120
3121/// Out-of-line implementation with no arguments is handy for gdb.
3122void ScheduleDAGMI::viewGraph() {
3123  viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
3124}
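// Debugger usage sketch (hypothetical session; "DAG" names whatever
// ScheduleDAGMI instance is in scope, debug builds with Graphviz only):
//   (gdb) call DAG->viewGraph()
// The -view-misched-dags flag triggers the same rendering from within the
// scheduler itself.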
3125