1//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief SI Machine Scheduler interface
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "SIMachineScheduler.h"
17#include "llvm/CodeGen/LiveInterval.h"
18#include "llvm/CodeGen/LiveIntervalAnalysis.h"
19#include "llvm/CodeGen/MachineRegisterInfo.h"
20#include "llvm/CodeGen/MachineScheduler.h"
21#include "llvm/CodeGen/RegisterPressure.h"
22
23using namespace llvm;
24
25#define DEBUG_TYPE "misched"
26
27// This scheduler implements a different scheduling algorithm than
28// GenericScheduler.
29//
30// There are several specific architecture behaviours that can't be modelled
31// for GenericScheduler:
32// . When accessing the result of an SGPR load instruction, you have to wait
33// for all the SGPR load instructions before your current instruction to
34// have finished.
35// . When accessing the result of an VGPR load instruction, you have to wait
36// for all the VGPR load instructions previous to the VGPR load instruction
37// you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden
39//
// Moreover some specificities (like the fact a lot of instructions in the
// shader have few dependencies) make the generic scheduler have some unpredictable
42// behaviours. For example when register pressure becomes high, it can either
43// manage to prevent register pressure from going too high, or it can
44// increase register pressure even more than if it hadn't taken register
45// pressure into account.
46//
47// Also some other bad behaviours are generated, like loading at the beginning
48// of the shader a constant in VGPR you won't need until the end of the shader.
49//
50// The scheduling problem for SI can distinguish three main parts:
51// . Hiding high latencies (texture sampling, etc)
52// . Hiding low latencies (SGPR constant loading, etc)
53// . Keeping register usage low for better latency hiding and general
54//   performance
55//
56// Some other things can also affect performance, but are hard to predict
57// (cache usage, the fact the HW can issue several instructions from different
58// wavefronts if different types, etc)
59//
60// This scheduler tries to solve the scheduling problem by dividing it into
61// simpler sub-problems. It divides the instructions into blocks, schedules
62// locally inside the blocks where it takes care of low latencies, and then
63// chooses the order of the blocks by taking care of high latencies.
64// Dividing the instructions into blocks helps control keeping register
65// usage low.
66//
67// First the instructions are put into blocks.
68//   We want the blocks help control register usage and hide high latencies
69//   later. To help control register usage, we typically want all local
//   computations, when for example you create a result that can be consumed
71//   right away, to be contained in a block. Block inputs and outputs would
72//   typically be important results that are needed in several locations of
73//   the shader. Since we do want blocks to help hide high latencies, we want
74//   the instructions inside the block to have a minimal set of dependencies
75//   on high latencies. It will make it easy to pick blocks to hide specific
76//   high latencies.
77//   The block creation algorithm is divided into several steps, and several
78//   variants can be tried during the scheduling process.
79//
// Second the order of the instructions inside the blocks is chosen.
81//   At that step we do take into account only register usage and hiding
82//   low latency instructions
83//
// Third the block order is chosen; there we try to hide high latencies
85// and keep register usage low.
86//
87// After the third step, a pass is done to improve the hiding of low
88// latencies.
89//
90// Actually when talking about 'low latency' or 'high latency' it includes
91// both the latency to get the cache (or global mem) data go to the register,
// and the bandwidth limitations.
93// Increasing the number of active wavefronts helps hide the former, but it
94// doesn't solve the latter, thus why even if wavefront count is high, we have
95// to try have as many instructions hiding high latencies as possible.
96// The OpenCL doc says for example latency of 400 cycles for a global mem access,
97// which is hidden by 10 instructions if the wavefront count is 10.
98
99// Some figures taken from AMD docs:
100// Both texture and constant L1 caches are 4-way associative with 64 bytes
101// lines.
102// Constant cache is shared with 4 CUs.
103// For texture sampling, the address generation unit receives 4 texture
104// addresses per cycle, thus we could expect texture sampling latency to be
105// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
106// instructions in a wavefront group are executed every 4 cycles),
107// or 16 instructions if the other wavefronts associated to the 3 other VALUs
108// of the CU do texture sampling too. (Don't take these figures too seriously,
109// as I'm not 100% sure of the computation)
110// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
112// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
113// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
114// It means a simple s_buffer_load should take one instruction to hide, as
115// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
116// cache line.
117//
118// As of today the driver doesn't preload the constants in cache, thus the
119// first loads get extra latency. The doc says global memory access can be
120// 300-600 cycles. We do not specially take that into account when scheduling
121// As we expect the driver to be able to preload the constants soon.
122
123
124// common code //
125
126#ifndef NDEBUG
127
128static const char *getReasonStr(SIScheduleCandReason Reason) {
129  switch (Reason) {
130  case NoCand:         return "NOCAND";
131  case RegUsage:       return "REGUSAGE";
132  case Latency:        return "LATENCY";
133  case Successor:      return "SUCCESSOR";
134  case Depth:          return "DEPTH";
135  case NodeOrder:      return "ORDER";
136  }
137  llvm_unreachable("Unknown reason!");
138}
139
140#endif
141
142static bool tryLess(int TryVal, int CandVal,
143                    SISchedulerCandidate &TryCand,
144                    SISchedulerCandidate &Cand,
145                    SIScheduleCandReason Reason) {
146  if (TryVal < CandVal) {
147    TryCand.Reason = Reason;
148    return true;
149  }
150  if (TryVal > CandVal) {
151    if (Cand.Reason > Reason)
152      Cand.Reason = Reason;
153    return true;
154  }
155  Cand.setRepeat(Reason);
156  return false;
157}
158
159static bool tryGreater(int TryVal, int CandVal,
160                       SISchedulerCandidate &TryCand,
161                       SISchedulerCandidate &Cand,
162                       SIScheduleCandReason Reason) {
163  if (TryVal > CandVal) {
164    TryCand.Reason = Reason;
165    return true;
166  }
167  if (TryVal < CandVal) {
168    if (Cand.Reason > Reason)
169      Cand.Reason = Reason;
170    return true;
171  }
172  Cand.setRepeat(Reason);
173  return false;
174}
175
176// SIScheduleBlock //
177
178void SIScheduleBlock::addUnit(SUnit *SU) {
179  NodeNum2Index[SU->NodeNum] = SUnits.size();
180  SUnits.push_back(SU);
181}
182
183#ifndef NDEBUG
184
185void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
186
187  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
188  dbgs() << '\n';
189}
190#endif
191
// Compares TryCand against the current best candidate Cand for top-down
// in-block scheduling, and records in TryCand.Reason why (or whether)
// TryCand wins. Priority order: SGPR pressure relief (only above 60),
// low latency ordering, VGPR pressure, original instruction order.
void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  // If in the block there is a lot of constant loads, the SGPR usage
  // could go quite high, thus above the arbitrary limit of 60 will encourage
  // use the already loaded constants (in order to release some SGPRs) before
  // loading more.
  if (Cand.SGPRUsage > 60 &&
      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (eventually some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
              Cand.HasLowLatencyNonWaitedParent,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                 TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  // Among low latency instructions, prefer the smaller offset
  // (presumably the memory offset of the load — TODO confirm).
  if (TryCand.IsLowLatency &&
      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  // Otherwise prefer whichever keeps the predicted VGPR pressure lowest.
  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}
241
242SUnit* SIScheduleBlock::pickNode() {
243  SISchedCandidate TopCand;
244
245  for (SUnit* SU : TopReadySUs) {
246    SISchedCandidate TryCand;
247    std::vector<unsigned> pressure;
248    std::vector<unsigned> MaxPressure;
249    // Predict register usage after this instruction.
250    TryCand.SU = SU;
251    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
252    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
253    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
254    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
255    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
256    TryCand.HasLowLatencyNonWaitedParent =
257      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
258    tryCandidateTopDown(TopCand, TryCand);
259    if (TryCand.Reason != NoCand)
260      TopCand.setBest(TryCand);
261  }
262
263  return TopCand.SU;
264}
265
266
267// Schedule something valid.
268void SIScheduleBlock::fastSchedule() {
269  TopReadySUs.clear();
270  if (Scheduled)
271    undoSchedule();
272
273  for (SUnit* SU : SUnits) {
274    if (!SU->NumPredsLeft)
275      TopReadySUs.push_back(SU);
276  }
277
278  while (!TopReadySUs.empty()) {
279    SUnit *SU = TopReadySUs[0];
280    ScheduledSUnits.push_back(SU);
281    nodeScheduled(SU);
282  }
283
284  Scheduled = true;
285}
286
287// Returns if the register was set between first and last.
288static bool isDefBetween(unsigned Reg,
289                           SlotIndex First, SlotIndex Last,
290                           const MachineRegisterInfo *MRI,
291                           const LiveIntervals *LIS) {
292  for (MachineRegisterInfo::def_instr_iterator
293       UI = MRI->def_instr_begin(Reg),
294       UE = MRI->def_instr_end(); UI != UE; ++UI) {
295    const MachineInstr* MI = &*UI;
296    if (MI->isDebugValue())
297      continue;
298    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
299    if (InstSlot >= First && InstSlot <= Last)
300      return true;
301  }
302  return false;
303}
304
// Computes the block's LiveInRegs/LiveOutRegs and pressure figures from
// the current (valid) schedule, and prepares TopRPTracker for the real
// top-down scheduling pass.
void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the SUs
  // to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not Track Physical Registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situation 4, 5, 6, 7
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
  // The use of findDefBetween removes the case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bits registers
  // for example, but they actually correspond to 4 actual 32 bits registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  // NOTE(review): TopPressure is a class member (unlike the local
  // BotPressure) — confirm it is up to date at this point.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}
377
// Schedules the block for real: first derives live-ins/outs from a quick
// valid schedule, then re-schedules using pickNode()'s heuristics while
// tracking register pressure top-down.
void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  // We need some valid schedule to compute live-ins/outs from.
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  // Seed with SUs that have no unscheduled predecessors.
  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    // Advance the tracker past the chosen instruction so the next
    // pickNode() call predicts pressure from the correct position.
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
            TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
              SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}
419
420void SIScheduleBlock::undoSchedule() {
421  for (SUnit* SU : SUnits) {
422    SU->isScheduled = false;
423    for (SDep& Succ : SU->Succs) {
424      if (BC->isSUInBlock(Succ.getSUnit(), ID))
425        undoReleaseSucc(SU, &Succ);
426    }
427  }
428  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
429  ScheduledSUnits.clear();
430  Scheduled = false;
431}
432
433void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
434  SUnit *SuccSU = SuccEdge->getSUnit();
435
436  if (SuccEdge->isWeak()) {
437    ++SuccSU->WeakPredsLeft;
438    return;
439  }
440  ++SuccSU->NumPredsLeft;
441}
442
// Decrements the predecessor counter of SuccEdge's target after SU was
// scheduled. Weak edges do not gate readiness and use a separate counter.
void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  // Catch double-release bugs: a successor must not be released more often
  // than it has predecessors.
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    SuccSU->dump(DAG);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}
461
462/// Release Successors of the SU that are in the block or not.
463void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
464  for (SDep& Succ : SU->Succs) {
465    SUnit *SuccSU = Succ.getSUnit();
466
467    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
468      continue;
469
470    releaseSucc(SU, &Succ);
471    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
472      TopReadySUs.push_back(SuccSU);
473  }
474}
475
476void SIScheduleBlock::nodeScheduled(SUnit *SU) {
477  // Is in TopReadySUs
478  assert (!SU->NumPredsLeft);
479  std::vector<SUnit*>::iterator I =
480    std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
481  if (I == TopReadySUs.end()) {
482    dbgs() << "Data Structure Bug in SI Scheduler\n";
483    llvm_unreachable(nullptr);
484  }
485  TopReadySUs.erase(I);
486
487  releaseSuccessors(SU, true);
488  // Scheduling this node will trigger a wait,
489  // thus propagate to other instructions that they do not need to wait either.
490  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
491    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
492
493  if (DAG->IsLowLatencySU[SU->NodeNum]) {
494     for (SDep& Succ : SU->Succs) {
495      std::map<unsigned, unsigned>::iterator I =
496        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
497      if (I != NodeNum2Index.end())
498        HasLowLatencyNonWaitedParent[I->second] = 1;
499    }
500  }
501  SU->isScheduled = true;
502}
503
504void SIScheduleBlock::finalizeUnits() {
505  // We remove links from outside blocks to enable scheduling inside the block.
506  for (SUnit* SU : SUnits) {
507    releaseSuccessors(SU, false);
508    if (DAG->IsHighLatencySU[SU->NodeNum])
509      HighLatencyBlock = true;
510  }
511  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
512}
513
514// we maintain ascending order of IDs
515void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
516  unsigned PredID = Pred->getID();
517
518  // Check if not already predecessor.
519  for (SIScheduleBlock* P : Preds) {
520    if (PredID == P->getID())
521      return;
522  }
523  Preds.push_back(Pred);
524
525  assert(none_of(Succs,
526                 [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
527         "Loop in the Block Graph!");
528}
529
530void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
531  unsigned SuccID = Succ->getID();
532
533  // Check if not already predecessor.
534  for (SIScheduleBlock* S : Succs) {
535    if (SuccID == S->getID())
536      return;
537  }
538  if (Succ->isHighLatencyBlock())
539    ++NumHighLatencySuccessors;
540  Succs.push_back(Succ);
541  assert(none_of(Preds,
542                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
543         "Loop in the Block Graph!");
544}
545
546#ifndef NDEBUG
547void SIScheduleBlock::printDebug(bool full) {
548  dbgs() << "Block (" << ID << ")\n";
549  if (!full)
550    return;
551
552  dbgs() << "\nContains High Latency Instruction: "
553         << HighLatencyBlock << '\n';
554  dbgs() << "\nDepends On:\n";
555  for (SIScheduleBlock* P : Preds) {
556    P->printDebug(false);
557  }
558
559  dbgs() << "\nSuccessors:\n";
560  for (SIScheduleBlock* S : Succs) {
561    S->printDebug(false);
562  }
563
564  if (Scheduled) {
565    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
566           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
567    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
568           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
569    dbgs() << "LiveIns:\n";
570    for (unsigned Reg : LiveInRegs)
571      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
572
573    dbgs() << "\nLiveOuts:\n";
574    for (unsigned Reg : LiveOutRegs)
575      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
576  }
577
578  dbgs() << "\nInstructions:\n";
579  if (!Scheduled) {
580    for (SUnit* SU : SUnits) {
581      SU->dump(DAG);
582    }
583  } else {
584    for (SUnit* SU : SUnits) {
585      SU->dump(DAG);
586    }
587  }
588
589   dbgs() << "///////////////////////\n";
590}
591
592#endif
593
594// SIScheduleBlockCreator //
595
596SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
597DAG(DAG) {
598}
599
600SIScheduleBlockCreator::~SIScheduleBlockCreator() {
601}
602
603SIScheduleBlocks
604SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
605  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
606    Blocks.find(BlockVariant);
607  if (B == Blocks.end()) {
608    SIScheduleBlocks Res;
609    createBlocksForVariant(BlockVariant);
610    topologicalSort();
611    scheduleInsideBlocks();
612    fillStats();
613    Res.Blocks = CurrentBlocks;
614    Res.TopDownIndex2Block = TopDownIndex2Block;
615    Res.TopDownBlock2Index = TopDownBlock2Index;
616    Blocks[BlockVariant] = Res;
617    return Res;
618  } else {
619    return B->second;
620  }
621}
622
623bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
624  if (SU->NodeNum >= DAG->SUnits.size())
625    return false;
626  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
627}
628
629void SIScheduleBlockCreator::colorHighLatenciesAlone() {
630  unsigned DAGSize = DAG->SUnits.size();
631
632  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
633    SUnit *SU = &DAG->SUnits[i];
634    if (DAG->IsHighLatencySU[SU->NodeNum]) {
635      CurrentColoring[SU->NodeNum] = NextReservedID++;
636    }
637  }
638}
639
640void SIScheduleBlockCreator::colorHighLatenciesGroups() {
641  unsigned DAGSize = DAG->SUnits.size();
642  unsigned NumHighLatencies = 0;
643  unsigned GroupSize;
644  unsigned Color = NextReservedID;
645  unsigned Count = 0;
646  std::set<unsigned> FormingGroup;
647
648  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
649    SUnit *SU = &DAG->SUnits[i];
650    if (DAG->IsHighLatencySU[SU->NodeNum])
651      ++NumHighLatencies;
652  }
653
654  if (NumHighLatencies == 0)
655    return;
656
657  if (NumHighLatencies <= 6)
658    GroupSize = 2;
659  else if (NumHighLatencies <= 12)
660    GroupSize = 3;
661  else
662    GroupSize = 4;
663
664  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
665    SUnit *SU = &DAG->SUnits[i];
666    if (DAG->IsHighLatencySU[SU->NodeNum]) {
667      unsigned CompatibleGroup = true;
668      unsigned ProposedColor = Color;
669      for (unsigned j : FormingGroup) {
670        // TODO: Currently CompatibleGroup will always be false,
671        // because the graph enforces the load order. This
672        // can be fixed, but as keeping the load order is often
673        // good for performance that causes a performance hit (both
674        // the default scheduler and this scheduler).
675        // When this scheduler determines a good load order,
676        // this can be fixed.
677        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
678            !DAG->canAddEdge(&DAG->SUnits[j], SU))
679          CompatibleGroup = false;
680      }
681      if (!CompatibleGroup || ++Count == GroupSize) {
682        FormingGroup.clear();
683        Color = ++NextReservedID;
684        if (!CompatibleGroup) {
685          ProposedColor = Color;
686          FormingGroup.insert(SU->NodeNum);
687        }
688        Count = 0;
689      } else {
690        FormingGroup.insert(SU->NodeNum);
691      }
692      CurrentColoring[SU->NodeNum] = ProposedColor;
693    }
694  }
695}
696
// Computes, for every SU, a color keyed on the combination of reserved
// (high latency) groups it transitively depends on (top-down pass) and
// that transitively depend on it (bottom-up pass). Results are stored in
// the two Current*ReservedDependencyColoring vectors; color 0 means "no
// reserved dependency".
void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given (reserved SU, e.g. a high latency instruction).
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

   for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as the parents.
    // NOTE(review): a single color > DAGSize appears to be a non-reserved
    // combination ID that can be reused directly — confirm the ID ranges.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      // New combination of parent colors: allocate (or reuse) an ID for it.
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
          CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given (reserved SU, e.g. a high latency instruction).
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as the children.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      // New combination of child colors: allocate (or reuse) an ID for it.
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
790
791void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
792  unsigned DAGSize = DAG->SUnits.size();
793  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
794
795  // Every combination of colors given by the top down
796  // and bottom up Reserved node dependency
797
798  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
799    SUnit *SU = &DAG->SUnits[i];
800    std::pair<unsigned, unsigned> SUColors;
801
802    // High latency instructions: already given.
803    if (CurrentColoring[SU->NodeNum])
804      continue;
805
806    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
807    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
808
809    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
810      ColorCombinations.find(SUColors);
811    if (Pos != ColorCombinations.end()) {
812      CurrentColoring[SU->NodeNum] = Pos->second;
813    } else {
814      CurrentColoring[SU->NodeNum] = NextNonReservedID;
815      ColorCombinations[SUColors] = NextNonReservedID++;
816    }
817  }
818}
819
// Merges SUs with no reserved-dependency color ("ends" of the graph) into
// the color of their successors when that choice is unambiguous, so
// results produced for a single consumer group stay in that group's block.
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    // Colors <= DAGSize appear to be reserved IDs — leave them untouched.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Skip SUs that are part of a high latency dependency chain.
    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // Merge only when both the committed and pending colors of all
    // successors agree on a single value.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  // Publish all merges at once so the traversal above sees a stable view.
  CurrentColoring = PendingColoring;
}
853
854
// Ensures the SUs sharing a non-reserved color form one consecutive run in
// NodeNum order: when a color reappears after its run was interrupted, the
// offending SU is recolored (fresh color, or absorbed into the directly
// preceding SU's color).
void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    // Record every color whose run has ended.
    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    // Colors <= DAGSize appear to be reserved IDs — never recolor those.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Color not interrupted yet: its run is still open.
    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}
887
888void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
889  unsigned DAGSize = DAG->SUnits.size();
890
891  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
892    SUnit *SU = &DAG->SUnits[SUNum];
893    std::set<unsigned> SUColors;
894
895    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
896      continue;
897
898    // No predecessor: Vgpr constant loading.
899    // Low latency instructions usually have a predecessor (the address)
900    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
901      continue;
902
903    for (SDep& SuccDep : SU->Succs) {
904      SUnit *Succ = SuccDep.getSUnit();
905      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
906        continue;
907      SUColors.insert(CurrentColoring[Succ->NodeNum]);
908    }
909    if (SUColors.size() == 1)
910      CurrentColoring[SU->NodeNum] = *SUColors.begin();
911  }
912}
913
914void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
915  unsigned DAGSize = DAG->SUnits.size();
916
917  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
918    SUnit *SU = &DAG->SUnits[SUNum];
919    std::set<unsigned> SUColors;
920
921    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
922      continue;
923
924    for (SDep& SuccDep : SU->Succs) {
925       SUnit *Succ = SuccDep.getSUnit();
926      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
927        continue;
928      SUColors.insert(CurrentColoring[Succ->NodeNum]);
929    }
930    if (SUColors.size() == 1)
931      CurrentColoring[SU->NodeNum] = *SUColors.begin();
932  }
933}
934
935void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
936  unsigned DAGSize = DAG->SUnits.size();
937
938  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
939    SUnit *SU = &DAG->SUnits[SUNum];
940    std::set<unsigned> SUColors;
941
942    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
943      continue;
944
945    for (SDep& SuccDep : SU->Succs) {
946       SUnit *Succ = SuccDep.getSUnit();
947      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
948        continue;
949      SUColors.insert(CurrentColoring[Succ->NodeNum]);
950    }
951    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
952      CurrentColoring[SU->NodeNum] = *SUColors.begin();
953  }
954}
955
956void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
957  unsigned DAGSize = DAG->SUnits.size();
958  std::map<unsigned, unsigned> ColorCount;
959
960  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
961    SUnit *SU = &DAG->SUnits[SUNum];
962    unsigned color = CurrentColoring[SU->NodeNum];
963    std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
964      if (Pos != ColorCount.end()) {
965        ++ColorCount[color];
966      } else {
967        ColorCount[color] = 1;
968      }
969  }
970
971  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
972    SUnit *SU = &DAG->SUnits[SUNum];
973    unsigned color = CurrentColoring[SU->NodeNum];
974    std::set<unsigned> SUColors;
975
976    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
977      continue;
978
979    if (ColorCount[color] > 1)
980      continue;
981
982    for (SDep& SuccDep : SU->Succs) {
983       SUnit *Succ = SuccDep.getSUnit();
984      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
985        continue;
986      SUColors.insert(CurrentColoring[Succ->NodeNum]);
987    }
988    if (SUColors.size() == 1 && *SUColors.begin() != color) {
989      --ColorCount[color];
990      CurrentColoring[SU->NodeNum] = *SUColors.begin();
991      ++ColorCount[*SUColors.begin()];
992    }
993  }
994}
995
// Intended to split oversized blocks into smaller ones; currently a
// no-op placeholder.
void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}
999
1000void SIScheduleBlockCreator::regroupNoUserInstructions() {
1001  unsigned DAGSize = DAG->SUnits.size();
1002  int GroupID = NextNonReservedID++;
1003
1004  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1005    SUnit *SU = &DAG->SUnits[SUNum];
1006    bool hasSuccessor = false;
1007
1008    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1009      continue;
1010
1011    for (SDep& SuccDep : SU->Succs) {
1012       SUnit *Succ = SuccDep.getSUnit();
1013      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1014        continue;
1015      hasSuccessor = true;
1016    }
1017    if (!hasSuccessor)
1018      CurrentColoring[SU->NodeNum] = GroupID;
1019  }
1020}
1021
// Run the full coloring pipeline for the requested variant, then turn
// each final color into one SIScheduleBlock, wire up the inter-block
// dependency edges, and finalize every block.
void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a final color to the index of the block created for it.
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  // Reserved IDs are 1..DAGSize; non-reserved IDs start above that.
  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    // First SU seen with this color: create the block for it.
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(
        make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
     for (SDep& SuccDep : SU->Succs) {
       SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // Only cross-block edges become block dependencies.
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  DEBUG(
    dbgs() << "Blocks created:\n\n";
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}
1101
1102// Two functions taken from Codegen/MachineScheduler.cpp
1103
1104/// If this iterator is a debug value, increment until reaching the End or a
1105/// non-debug instruction.
1106static MachineBasicBlock::const_iterator
1107nextIfDebug(MachineBasicBlock::const_iterator I,
1108            MachineBasicBlock::const_iterator End) {
1109  for(; I != End; ++I) {
1110    if (!I->isDebugValue())
1111      break;
1112  }
1113  return I;
1114}
1115
/// Non-const version: delegates to the const overload above, then strips
/// the constness from the resulting iterator.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  // Cast the return value to nonconst MachineInstr, then cast to an
  // instr_iterator, which does not check for null, finally return a
  // bundle_iterator.
  return MachineBasicBlock::instr_iterator(
    const_cast<MachineInstr*>(
      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
}
1127
// Topologically sort the blocks: compute a top-down index for every
// block (TopDownBlock2Index / TopDownIndex2Block) and the reversed
// bottom-up order (BottomUpIndex2Block).
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  // TopDownBlock2Index temporarily holds each block's remaining
  // successor count; blocks with no successors seed the worklist.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  // Peel blocks from the bottom of the DAG, assigning top-down indices
  // from DAGSize-1 downward; a predecessor becomes ready once all of
  // its successors have been numbered.
  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
      "Wrong Top Down topological sorting");
    }
  }
#endif

  // Bottom-up order is simply the reverse of the top-down order.
  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
1175
// Schedule the instructions inside each block: fast-schedule first,
// physically reorder the MIs so every block is a contiguous range (to
// get correct liveness for the real per-block scheduling), run the real
// schedule, then restore the original MI order.
void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  // PosOld/PosNew record, per moved instruction, its position before and
  // after the splice so the original order can be restored below.
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        // Already in place: just advance past it (skipping debug values).
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove everytime
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  // Walk the recorded moves in reverse so each splice undoes the
  // corresponding forward move.
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  DEBUG(
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}
1258
1259void SIScheduleBlockCreator::fillStats() {
1260  unsigned DAGSize = CurrentBlocks.size();
1261
1262  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1263    int BlockIndice = TopDownIndex2Block[i];
1264    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1265    if (Block->getPreds().size() == 0)
1266      Block->Depth = 0;
1267    else {
1268      unsigned Depth = 0;
1269      for (SIScheduleBlock *Pred : Block->getPreds()) {
1270        if (Depth < Pred->Depth + 1)
1271          Depth = Pred->Depth + 1;
1272      }
1273      Block->Depth = Depth;
1274    }
1275  }
1276
1277  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1278    int BlockIndice = BottomUpIndex2Block[i];
1279    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1280    if (Block->getSuccs().size() == 0)
1281      Block->Height = 0;
1282    else {
1283      unsigned Height = 0;
1284      for (SIScheduleBlock *Succ : Block->getSuccs()) {
1285        if (Height < Succ->Height + 1)
1286          Height = Succ->Height + 1;
1287      }
1288      Block->Height = Height;
1289    }
1290  }
1291}
1292
1293// SIScheduleBlockScheduler //
1294
// Build the block-level scheduling state (live-out usage counts,
// pred/succ counters, initial live registers) and immediately run the
// block scheduler, filling BlocksScheduled.
SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks  BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      // Find the predecessor producing Reg with the highest top-down
      // topological index (the "latest" producer).
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      // Attribute the usage to that latest producer.
      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      std::map<unsigned, unsigned>::iterator RegPos =
        LiveOutRegsNumUsages[PredID].find(Reg);
      if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
        ++LiveOutRegsNumUsages[PredID][Reg];
      } else {
        LiveOutRegsNumUsages[PredID][Reg] = 1;
      }
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      // No producer among the predecessors: the register comes from
      // outside the region, so count this block as a consumer.
      if (!Found) {
        if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
          LiveRegsConsumers[Reg] = 1;
        else
          ++LiveRegsConsumers[Reg];
      }
    }
  }

  // Seed the ready list with blocks that have no predecessors.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  // Main scheduling loop: pick and commit blocks until none are ready.
  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  DEBUG(
    dbgs() << "Block Order:";
    for (SIScheduleBlock* Block : BlocksScheduled) {
      dbgs() << ' ' << Block->getID();
    }
  );
}
1408
// Latency-oriented candidate comparison: returns true when TryCand
// should replace Cand (TryCand.Reason records why). Criteria are tried
// in priority order; the first decisive one wins.
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  // First candidate examined always wins by default.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (tryLess(TryCand.LastPosHighLatParentScheduled,
              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                 TryCand, Cand, Latency))
    return true;
  // Among high-latency blocks, prefer the one furthest from the leaves.
  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
                                          TryCand, Cand, Depth))
    return true;
  // Prefer blocks that unlock more high-latency successors.
  if (tryGreater(TryCand.NumHighLatencySuccessors,
                 Cand.NumHighLatencySuccessors,
                 TryCand, Cand, Successor))
    return true;
  return false;
}
1433
// Register-usage-oriented candidate comparison: returns true when
// TryCand should replace Cand. Prefers candidates that do not increase
// VGPR pressure, then those unlocking successors, then by height, then
// by smallest pressure delta.
bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  // First candidate examined always wins by default.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Strongly prefer blocks that do not increase VGPR usage at all.
  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
              TryCand, Cand, RegUsage))
    return true;
  if (tryGreater(TryCand.NumSuccessors > 0,
                 Cand.NumSuccessors > 0,
                 TryCand, Cand, Successor))
    return true;
  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  // Tie-break on the actual pressure delta.
  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
              TryCand, Cand, RegUsage))
    return true;
  return false;
}
1455
1456SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
1457  SIBlockSchedCandidate Cand;
1458  std::vector<SIScheduleBlock*>::iterator Best;
1459  SIScheduleBlock *Block;
1460  if (ReadyBlocks.empty())
1461    return nullptr;
1462
1463  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
1464                        VregCurrentUsage, SregCurrentUsage);
1465  if (VregCurrentUsage > maxVregUsage)
1466    maxVregUsage = VregCurrentUsage;
1467  if (VregCurrentUsage > maxSregUsage)
1468    maxSregUsage = VregCurrentUsage;
1469  DEBUG(
1470    dbgs() << "Picking New Blocks\n";
1471    dbgs() << "Available: ";
1472    for (SIScheduleBlock* Block : ReadyBlocks)
1473      dbgs() << Block->getID() << ' ';
1474    dbgs() << "\nCurrent Live:\n";
1475    for (unsigned Reg : LiveRegs)
1476      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
1477    dbgs() << '\n';
1478    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
1479    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
1480  );
1481
1482  Cand.Block = nullptr;
1483  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
1484       E = ReadyBlocks.end(); I != E; ++I) {
1485    SIBlockSchedCandidate TryCand;
1486    TryCand.Block = *I;
1487    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
1488    TryCand.VGPRUsageDiff =
1489      checkRegUsageImpact(TryCand.Block->getInRegs(),
1490                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
1491    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
1492    TryCand.NumHighLatencySuccessors =
1493      TryCand.Block->getNumHighLatencySuccessors();
1494    TryCand.LastPosHighLatParentScheduled =
1495      (unsigned int) std::max<int> (0,
1496         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
1497           LastPosWaitedHighLatency);
1498    TryCand.Height = TryCand.Block->Height;
1499    // Try not to increase VGPR usage too much, else we may spill.
1500    if (VregCurrentUsage > 120 ||
1501        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
1502      if (!tryCandidateRegUsage(Cand, TryCand) &&
1503          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
1504        tryCandidateLatency(Cand, TryCand);
1505    } else {
1506      if (!tryCandidateLatency(Cand, TryCand))
1507        tryCandidateRegUsage(Cand, TryCand);
1508    }
1509    if (TryCand.Reason != NoCand) {
1510      Cand.setBest(TryCand);
1511      Best = I;
1512      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
1513                   << getReasonStr(Cand.Reason) << '\n');
1514    }
1515  }
1516
1517  DEBUG(
1518    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
1519    dbgs() << "Is a block with high latency instruction: "
1520      << (Cand.IsHighLatency ? "yes\n" : "no\n");
1521    dbgs() << "Position of last high latency dependency: "
1522           << Cand.LastPosHighLatParentScheduled << '\n';
1523    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
1524    dbgs() << '\n';
1525  );
1526
1527  Block = Cand.Block;
1528  ReadyBlocks.erase(Best);
1529  return Block;
1530}
1531
1532// Tracking of currently alive registers to determine VGPR Usage.
1533
1534void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1535  for (unsigned Reg : Regs) {
1536    // For now only track virtual registers.
1537    if (!TargetRegisterInfo::isVirtualRegister(Reg))
1538      continue;
1539    // If not already in the live set, then add it.
1540    (void) LiveRegs.insert(Reg);
1541  }
1542}
1543
1544void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
1545                                       std::set<unsigned> &Regs) {
1546  for (unsigned Reg : Regs) {
1547    // For now only track virtual registers.
1548    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1549    assert (Pos != LiveRegs.end() && // Reg must be live.
1550               LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
1551               LiveRegsConsumers[Reg] >= 1);
1552    --LiveRegsConsumers[Reg];
1553    if (LiveRegsConsumers[Reg] == 0)
1554      LiveRegs.erase(Pos);
1555  }
1556}
1557
1558void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
1559  for (SIScheduleBlock* Block : Parent->getSuccs()) {
1560    --BlockNumPredsLeft[Block->getID()];
1561    if (BlockNumPredsLeft[Block->getID()] == 0) {
1562      ReadyBlocks.push_back(Block);
1563    }
1564    // TODO: Improve check. When the dependency between the high latency
1565    // instructions and the instructions of the other blocks are WAR or WAW
1566    // there will be no wait triggered. We would like these cases to not
1567    // update LastPosHighLatencyParentScheduled.
1568    if (Parent->isHighLatencyBlock())
1569      LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
1570  }
1571}
1572
1573void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
1574  decreaseLiveRegs(Block, Block->getInRegs());
1575  addLiveRegs(Block->getOutRegs());
1576  releaseBlockSuccs(Block);
1577  for (std::map<unsigned, unsigned>::iterator RegI =
1578       LiveOutRegsNumUsages[Block->getID()].begin(),
1579       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
1580    std::pair<unsigned, unsigned> RegP = *RegI;
1581    if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
1582      LiveRegsConsumers[RegP.first] = RegP.second;
1583    else {
1584      assert(LiveRegsConsumers[RegP.first] == 0);
1585      LiveRegsConsumers[RegP.first] += RegP.second;
1586    }
1587  }
1588  if (LastPosHighLatencyParentScheduled[Block->getID()] >
1589        (unsigned)LastPosWaitedHighLatency)
1590    LastPosWaitedHighLatency =
1591      LastPosHighLatencyParentScheduled[Block->getID()];
1592  ++NumBlockScheduled;
1593}
1594
1595std::vector<int>
1596SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
1597                                     std::set<unsigned> &OutRegs) {
1598  std::vector<int> DiffSetPressure;
1599  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1600
1601  for (unsigned Reg : InRegs) {
1602    // For now only track virtual registers.
1603    if (!TargetRegisterInfo::isVirtualRegister(Reg))
1604      continue;
1605    if (LiveRegsConsumers[Reg] > 1)
1606      continue;
1607    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1608    for (; PSetI.isValid(); ++PSetI) {
1609      DiffSetPressure[*PSetI] -= PSetI.getWeight();
1610    }
1611  }
1612
1613  for (unsigned Reg : OutRegs) {
1614    // For now only track virtual registers.
1615    if (!TargetRegisterInfo::isVirtualRegister(Reg))
1616      continue;
1617    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1618    for (; PSetI.isValid(); ++PSetI) {
1619      DiffSetPressure[*PSetI] += PSetI.getWeight();
1620    }
1621  }
1622
1623  return DiffSetPressure;
1624}
1625
1626// SIScheduler //
1627
1628struct SIScheduleBlockResult
1629SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1630                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
1631  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1632  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1633  std::vector<SIScheduleBlock*> ScheduledBlocks;
1634  struct SIScheduleBlockResult Res;
1635
1636  ScheduledBlocks = Scheduler.getBlocks();
1637
1638  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
1639    SIScheduleBlock *Block = ScheduledBlocks[b];
1640    std::vector<SUnit*> SUs = Block->getScheduledUnits();
1641
1642    for (SUnit* SU : SUs)
1643      Res.SUs.push_back(SU->NodeNum);
1644  }
1645
1646  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1647  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1648  return Res;
1649}
1650
1651// SIScheduleDAGMI //
1652
// Construct the SI scheduling DAG: cache the SI-specific instruction
// and register info, and look up the 32-bit VGPR/SGPR pressure-set IDs
// used for usage tracking.
SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  VGPRSetID = SITRI->getVGPR32PressureSet();
  SGPRSetID = SITRI->getSGPR32PressureSet();
}
1661
1662SIScheduleDAGMI::~SIScheduleDAGMI() {
1663}
1664
// Create a new SI scheduling DAG; the caller takes ownership of the
// returned object.
ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}
1668
1669// Code adapted from scheduleDAG.cpp
1670// Does a topological sort over the SUs.
1671// Both TopDown and BottomUp
1672void SIScheduleDAGMI::topologicalSort() {
1673  Topo.InitDAGTopologicalSorting();
1674
1675  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1676  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
1677}
1678
// Move low latencies further from their user without
// increasing SGPR usage (in general)
// This is to be replaced by a better pass that would
// take into account SGPR usage (based on VGPR Usage
// and the corresponding wavefront count), that would
// try to merge groups of loads if it make sense, etc
void SIScheduleDAGMI::moveLowLatencies() {
   unsigned DAGSize = SUnits.size();
   int LastLowLatencyUser = -1;
   int LastLowLatencyPos = -1;

   for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    // Earliest legal position for SU: one past its latest predecessor.
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      // Hoist this low-latency instruction as early as legality and the
      // positions of earlier low-latency users/instructions allow.
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        // Shift the range [BestPos, i) one slot down, keeping the
        // inverse permutation (ScheduledSUnitsInv) in sync.
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Moves COPY instructions on which depends
    // the low latency instructions too.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        // Same shifting scheme as above, hoisting the COPY to MinPos.
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}
1749
1750void SIScheduleDAGMI::restoreSULinksLeft() {
1751  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1752    SUnits[i].isScheduled = false;
1753    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1754    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1755    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1756    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1757  }
1758}
1759
1760// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1761template<typename _Iterator> void
1762SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
1763                                  unsigned &VgprUsage, unsigned &SgprUsage) {
1764  VgprUsage = 0;
1765  SgprUsage = 0;
1766  for (_Iterator RegI = First; RegI != End; ++RegI) {
1767    unsigned Reg = *RegI;
1768    // For now only track virtual registers
1769    if (!TargetRegisterInfo::isVirtualRegister(Reg))
1770      continue;
1771    PSetIterator PSetI = MRI.getPressureSets(Reg);
1772    for (; PSetI.isValid(); ++PSetI) {
1773      if (*PSetI == VGPRSetID)
1774        VgprUsage += PSetI.getWeight();
1775      else if (*PSetI == SGPRSetID)
1776        SgprUsage += PSetI.getWeight();
1777    }
1778  }
1779}
1780
// Main entry point: build the scheduling DAG for the region, gather
// per-instruction latency information, run the block scheduler (retrying
// with other variants when VGPR pressure is high), then emit the chosen
// instruction order.
void SIScheduleDAGMI::schedule()
{
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  DEBUG(
    for(SUnit& SU : SUnits)
       SU.dumpAll(this)
  );

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it)
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  // Save the SUnit link counters so they can be restored (see
  // restoreSULinksLeft) between scheduling attempts.
  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  // Classify each instruction as low/high latency, and record the memory
  // offset of low-latency ones when it can be determined.
  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    unsigned BaseLatReg;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
                                       TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

  // First attempt: the variant pair that performs best in the common case.
  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // if VGPR usage is extremely high, try other good performing variants
  // which could lead to lower VGPR usage
  // (180 is a heuristic threshold; commented-out pairs were judged not
  // worth trying at this level.)
  if (Best.MaxVGPRUsage > 180) {
    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // if VGPR usage is still extremely high, we may spill. Try other variants
  // which are less performing, but that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  // Adopt the best schedule found and build the NodeNum -> position
  // inverse map required by moveLowLatencies().
  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  // Emit the instructions in the chosen order.
  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                 << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
      unsigned BBNum = begin()->getParent()->getNumber();
      dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
      dumpSchedule();
      dbgs() << '\n';
    });
}
1904