//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"

using namespace llvm;

#define DEBUG_TYPE "misched"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI splits into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if they are of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps control keeping register
// usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies.
// That makes it easy to pick blocks to hide specific high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency for the cache (or global mem) data to get to the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if wavefront count is high, we
// have to try to have as many instructions hiding high latencies as possible.
// The OpenCL doc gives for example a latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.
// (With 10 wavefronts sharing a SIMD, a wavefront issues roughly one
// instruction every 10 x 4 = 40 cycles, so 400 / 40 = 10 instructions of the
// same wavefront cover the latency.)

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.
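
// For illustration, a hypothetical hand-written shader fragment (not
// compiler output) showing the kind of grouping the block creation aims for:
//
//   s_buffer_load_dword s4, s[0:3], 0x0   ; low latency constant load
//   image_sample v[0:3], ...              ; high latency texture sample
//   v_mul_f32 v4, v0, v5                  ; consumer of the sample
//   v_add_f32 v6, v7, v8                  ; independent local computation
//
// The sample and its consumer v_mul_f32 would typically land in different
// blocks, so that independent work like v_add_f32 (and whole blocks of it)
// can be placed between them to hide the sample's latency.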

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:    return "NOCAND";
  case RegUsage:  return "REGUSAGE";
  case Latency:   return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth:     return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG

void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as early as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //   - (eventually some more low latency instructions)
  //   - instructions that depend on the first low latency instructions.
  // If there are a lot of constant loads in the block, the SGPR usage
  // could go quite high, thus the arbitrary limit of 60 above encourages
  // using the already loaded constants (in order to release some SGPRs)
  // before loading more.
  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
              Cand.HasLowLatencyNonWaitedParent,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                 TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
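  // (A lower NodeNum means the instruction appeared earlier in the original
  // program order.)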
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns whether the register was set between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
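  // (An assumption here: the SGPR/VGPR pressure sets we query are computed
  // from virtual registers only, see fillVgprSgprCost below, so restricting
  // LiveInRegs to virtual registers should lose nothing we rely on.)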
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7.
  // Since we made the MIs of a block be packed all together before
  // scheduling, the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, and 2 vs 3.
  // (Note: This is not sufficient for the RPTracker to avoid mistakes
  // for case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bit registers
  // for example, but they actually correspond to 4 actual 32 bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    SuccSU->dump(DAG);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU. InOrOutBlock selects whether the successors
/// released are those inside the block (true) or those outside it (false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs.
  assert (!SU->NumPredsLeft);
  std::vector<SUnit*>::iterator I =
    std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
  unsigned SuccID = Succ->getID();

  // Check if not already successor.
  for (SIScheduleBlock* S : Succs) {
    if (SuccID == S->getID())
      return;
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(Succ);
  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (SIScheduleBlock* S : Succs) {
    S->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (SUnit* SU : SUnits) {
    SU->dump(DAG);
  }

  dbgs() << "///////////////////////\n";
}

#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
DAG(DAG) {
}

SIScheduleBlockCreator::~SIScheduleBlockCreator() {
}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  } else {
    return B->second;
  }
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  unsigned Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      bool CompatibleGroup = true;
      unsigned ProposedColor = Color;
      for (unsigned j : FormingGroup) {
        // TODO: Currently CompatibleGroup will always be false,
        // because the graph enforces the load order. This can be
        // fixed, but since keeping the load order is often good
        // for performance, breaking it causes a performance hit
        // (with both the default scheduler and this scheduler).
        // When this scheduler can determine a good load order,
        // this can be fixed.
        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
            !DAG->canAddEdge(&DAG->SUnits[j], SU))
          CompatibleGroup = false;
      }
      if (!CompatibleGroup || ++Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        if (!CompatibleGroup) {
          ProposedColor = Color;
          FormingGroup.insert(SU->NodeNum);
        }
        Count = 0;
      } else {
        FormingGroup.insert(SU->NodeNum);
      }
      CurrentColoring[SU->NodeNum] = ProposedColor;
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as children.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
    if (Pos != ColorCount.end()) {
      ++ColorCount[color];
    } else {
      ColorCount[color] = 1;
    }
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();

  // Put SUs of same color into same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(
        make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
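  // For every DAG edge that crosses a block boundary, record a block-level
  // edge: e.g. if an SU placed in block 0 feeds an SU placed in block 2,
  // block 0 gains block 2 as successor and block 2 gains block 0 as
  // predecessor.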
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  DEBUG(
    dbgs() << "Blocks created:\n\n";
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

// Two functions taken from CodeGen/MachineScheduler.cpp

/// If this iterator is a debug value, increment until reaching the End or a
/// non-debug instruction.
static MachineBasicBlock::const_iterator
nextIfDebug(MachineBasicBlock::const_iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  // Cast the return value to nonconst MachineInstr, then cast to an
  // instr_iterator, which does not check for null, finally return a
  // bundle_iterator.
  return MachineBasicBlock::instr_iterator(
    const_cast<MachineInstr*>(
      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
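  // Every block must come after all of its predecessors in the
  // top-down order.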
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most CPU intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  DEBUG(
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().size() == 0)
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + 1)
          Depth = Pred->Depth + 1;
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().size() == 0)
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (SIScheduleBlock *Succ : Block->getSuccs()) {
        if (Height < Succ->Height + 1)
          Height = Succ->Height + 1;
      }
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
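  // In the example above: since B depends on A, B has the higher topological
  // index, so C's use of x'/x is credited to B's live-outs rather than A's,
  // even though both blocks write that virtual register.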
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      std::map<unsigned, unsigned>::iterator RegPos =
        LiveOutRegsNumUsages[PredID].find(Reg);
      if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
        ++LiveOutRegsNumUsages[PredID][Reg];
      } else {
        LiveOutRegsNumUsages[PredID][Reg] = 1;
      }
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found) {
        if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
          LiveRegsConsumers[Reg] = 1;
        else
          ++LiveRegsConsumers[Reg];
      }
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  DEBUG(
    dbgs() << "Block Order:";
    for (SIScheduleBlock* Block : BlocksScheduled) {
      dbgs() << ' ' << Block->getID();
    }
  );
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (tryLess(TryCand.LastPosHighLatParentScheduled,
              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                 TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
                                          TryCand, Cand, Depth))
    return true;
  if (tryGreater(TryCand.NumHighLatencySuccessors,
                 Cand.NumHighLatencySuccessors,
                 TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
              TryCand, Cand, RegUsage))
    return true;
  if (tryGreater(TryCand.NumSuccessors > 0,
                 Cand.NumSuccessors > 0,
                 TryCand, Cand, Successor))
    return true;
  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
              TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  DEBUG(
    dbgs() << "Picking New Blocks\n";
    dbgs() << "Available: ";
    for (SIScheduleBlock* Block : ReadyBlocks)
      dbgs() << Block->getID() << ' ';
    dbgs() << "\nCurrent Live:\n";
    for (unsigned Reg : LiveRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    dbgs() << '\n';
    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
  );

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
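    // Past that (arbitrary) threshold, or for the register-usage-first
    // variants, rank candidates on register usage first and fall back to
    // latency; otherwise rank on latency first and fall back to register
    // usage.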
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                   << getReasonStr(Cand.Reason) << '\n');
    }
  }

  DEBUG(
    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
    dbgs() << "Is a block with high latency instruction: "
           << (Cand.IsHighLatency ? "yes\n" : "no\n");
    dbgs() << "Position of last high latency dependency: "
           << Cand.LastPosHighLatParentScheduled << '\n';
    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
    dbgs() << '\n';
  );

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}

// Tracking of currently alive registers to determine VGPR Usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert (Pos != LiveRegs.end() && // Reg must be live.
            LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
            LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (SIScheduleBlock* Block : Parent->getSuccs()) {
    --BlockNumPredsLeft[Block->getID()];
    if (BlockNumPredsLeft[Block->getID()] == 0) {
      ReadyBlocks.push_back(Block);
    }
    // TODO: Improve check. When the dependencies between the high latency
    // instructions and the instructions of the other blocks are WAR or WAW,
    // there will be no wait triggered. We would like these cases to not
    // update LastPosHighLatencyParentScheduled.
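    // For now, conservatively record the current position for every
    // successor of a high latency block.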
    if (Parent->isHighLatencyBlock())
      LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
  }
}

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (std::map<unsigned, unsigned>::iterator RegI =
       LiveOutRegsNumUsages[Block->getID()].begin(),
       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    std::pair<unsigned, unsigned> RegP = *RegI;
    if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
      LiveRegsConsumers[RegP.first] = RegP.second;
    else {
      assert(LiveRegsConsumers[RegP.first] == 0);
      LiveRegsConsumers[RegP.first] += RegP.second;
    }
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (unsigned Reg : InRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (unsigned Reg : OutRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    SIScheduleBlock *Block = ScheduledBlocks[b];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  VGPRSetID = SITRI->getVGPR32PressureSet();
  SGPRSetID = SITRI->getSGPR32PressureSet();
}

SIScheduleDAGMI::~SIScheduleDAGMI() {
}

ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

// Code adapted from scheduleDAG.cpp.
// Does a topological sort over the SUs.
// SIScheduler //

struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    SIScheduleBlock *Block = ScheduledBlocks[b];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  VGPRSetID = SITRI->getVGPR32PressureSet();
  SGPRSetID = SITRI->getSGPR32PressureSet();
}

SIScheduleDAGMI::~SIScheduleDAGMI() {
}

ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

// Code adapted from scheduleDAG.cpp.
// Does a topological sort over the SUs, recording both the top-down and
// the bottom-up orders.
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take SGPR usage
// into account (based on VGPR usage and the corresponding wavefront
// count), try to merge groups of loads when it makes sense, etc.
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the low latency
    // instructions depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}
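// Illustration of moveLowLatencies() (hypothetical schedule, opcodes chosen
// only for exposition): given the scheduled order
//   s_load A ; v_add X ; s_load B ; s_add Y (uses B)
// the load of B is hoisted just past the last low latency position seen so
// far, giving
//   s_load A ; s_load B ; v_add X ; s_add Y (uses B)
// so one more independent instruction helps hide the latency of B.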
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
template<typename _Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (_Iterator RegI = First; RegI != End; ++RegI) {
    unsigned Reg = *RegI;
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == VGPRSetID)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == SGPRSetID)
        SgprUsage += PSetI.getWeight();
    }
  }
}
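// Possible use of fillVgprSgprCost() (hypothetical caller, for illustration
// only): given a std::set<unsigned> LiveIns of virtual registers live into
// a region,
//   unsigned VgprUsage, SgprUsage;
//   fillVgprSgprCost(LiveIns.begin(), LiveIns.end(), VgprUsage, SgprUsage);
// leaves in VgprUsage and SgprUsage the summed VGPR32 and SGPR32
// pressure-set weights of those registers.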
void SIScheduleDAGMI::schedule()
{
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  DEBUG(
    for (SUnit& SU : SUnits)
      SU.dumpAll(this)
  );

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    unsigned BaseLatReg;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
                                       TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other variants that perform well
  // but could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
      { LatenciesAlone, BlockRegUsageLatency },
//    { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//    { LatenciesGrouped, BlockRegUsageLatency },
//    { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//    { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//    { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // that perform worse, but could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
//    { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//    { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//    { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                 << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
    unsigned BBNum = begin()->getParent()->getNumber();
    dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}