ScheduleDAGInstrs.cpp revision 79ce276083ced01256a0eb7d80731e4948ca6e87
//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This implements the ScheduleDAGInstrs class, which implements re-scheduling
// of MachineInstrs.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "sched-instrs"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtarget.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SmallSet.h"
#include <map>
using namespace llvm;

namespace {
  class VISIBILITY_HIDDEN LoopDependencies {
    const MachineLoopInfo &MLI;
    const MachineDominatorTree &MDT;

  public:
    typedef std::map<unsigned, std::pair<const MachineOperand *, unsigned> >
      LoopDeps;
    LoopDeps Deps;

    LoopDependencies(const MachineLoopInfo &mli,
                     const MachineDominatorTree &mdt) :
      MLI(mli), MDT(mdt) {}

    void VisitLoop(const MachineLoop *Loop) {
      Deps.clear();
      MachineBasicBlock *Header = Loop->getHeader();
      SmallSet<unsigned, 8> LoopLiveIns;
      for (MachineBasicBlock::livein_iterator LI = Header->livein_begin(),
           LE = Header->livein_end(); LI != LE; ++LI)
        LoopLiveIns.insert(*LI);

      const MachineDomTreeNode *Node = MDT.getNode(Header);
      const MachineBasicBlock *MBB = Node->getBlock();
      assert(Loop->contains(MBB) &&
             "Loop does not contain header!");
      VisitRegion(Node, MBB, Loop, LoopLiveIns);
    }

  private:
    void VisitRegion(const MachineDomTreeNode *Node,
                     const MachineBasicBlock *MBB,
                     const MachineLoop *Loop,
                     const SmallSet<unsigned, 8> &LoopLiveIns) {
      unsigned Count = 0;
      for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
           I != E; ++I, ++Count) {
        const MachineInstr *MI = I;
        for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
          const MachineOperand &MO = MI->getOperand(i);
          if (!MO.isReg() || !MO.isUse())
            continue;
          unsigned MOReg = MO.getReg();
          if (LoopLiveIns.count(MOReg))
            Deps.insert(std::make_pair(MOReg, std::make_pair(&MO, Count)));
        }
      }

      const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
      for (std::vector<MachineDomTreeNode*>::const_iterator I =
           Children.begin(), E = Children.end(); I != E; ++I) {
        const MachineDomTreeNode *ChildNode = *I;
        MachineBasicBlock *ChildBlock = ChildNode->getBlock();
        if (Loop->contains(ChildBlock))
          VisitRegion(ChildNode, ChildBlock, Loop, LoopLiveIns);
      }
    }
  };
}

ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                     const MachineLoopInfo &mli,
                                     const MachineDominatorTree &mdt)
  : ScheduleDAG(mf), MLI(mli), MDT(mdt) {}

void ScheduleDAGInstrs::BuildSchedGraph() {
  SUnits.reserve(BB->size());

  // We build scheduling units by walking a block's instruction list from bottom
  // to top.
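  // Walking bottom-up means that by the time an instruction is visited, SUnits
  // already exist for all the instructions below it, so each dependence can be
  // recorded immediately by adding the new SUnit as a predecessor of the
  // dependent SUnit.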

  // Remember where a generic side-effecting instruction is as we proceed. If
  // ChainMMO is null, this is assumed to have arbitrary side-effects. If
  // ChainMMO is non-null, then Chain makes only a single memory reference.
  SUnit *Chain = 0;
  MachineMemOperand *ChainMMO = 0;

  // Memory references to specific known memory locations are tracked so that
  // they can be given more precise dependencies.
  std::map<const Value *, SUnit *> MemDefs;
  std::map<const Value *, std::vector<SUnit *> > MemUses;

  // Terminators can perform control transfers, so we need to make sure that
  // all the work of the block is done before the terminator.
  SUnit *Terminator = 0;

  LoopDependencies LoopRegs(MLI, MDT);

  // Track which regs are live into a loop, to help guide back-edge-aware
  // scheduling.
  SmallSet<unsigned, 8> LoopLiveInRegs;
  if (MachineLoop *ML = MLI.getLoopFor(BB))
    if (BB == ML->getLoopLatch()) {
      MachineBasicBlock *Header = ML->getHeader();
      for (MachineBasicBlock::livein_iterator I = Header->livein_begin(),
           E = Header->livein_end(); I != E; ++I)
        LoopLiveInRegs.insert(*I);
      LoopRegs.VisitLoop(ML);
    }

  // Check to see if the scheduler cares about latencies.
  bool UnitLatencies = ForceUnitLatencies();

  // Ask the target if address-backscheduling is desirable, and if so how much.
  unsigned SpecialAddressLatency =
    TM.getSubtarget<TargetSubtarget>().getSpecialAddressLatency();

  for (MachineBasicBlock::iterator MII = BB->end(), MIE = BB->begin();
       MII != MIE; --MII) {
    MachineInstr *MI = prior(MII);
    const TargetInstrDesc &TID = MI->getDesc();
    SUnit *SU = NewSUnit(MI);

    // Assign the Latency field of SU using target-provided information.
    if (UnitLatencies)
      SU->Latency = 1;
    else
      ComputeLatency(SU);

    // Add register-based dependencies (data, anti, and output).
    for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
      const MachineOperand &MO = MI->getOperand(j);
      if (!MO.isReg()) continue;
      unsigned Reg = MO.getReg();
      if (Reg == 0) continue;

      assert(TRI->isPhysicalRegister(Reg) && "Virtual register encountered!");
      std::vector<SUnit *> &UseList = Uses[Reg];
      std::vector<SUnit *> &DefList = Defs[Reg];
      // Optionally add output and anti dependencies.
      // TODO: Using a latency of 1 here assumes there's no cost for
      // reusing registers.
      SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
      for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
        SUnit *DefSU = DefList[i];
        if (DefSU != SU &&
            (Kind != SDep::Output || !MO.isDead() ||
             !DefSU->getInstr()->registerDefIsDead(Reg)))
          DefSU->addPred(SDep(SU, Kind, /*Latency=*/1, /*Reg=*/Reg));
      }
      for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
        std::vector<SUnit *> &DefList = Defs[*Alias];
        for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
          SUnit *DefSU = DefList[i];
          if (DefSU != SU &&
              (Kind != SDep::Output || !MO.isDead() ||
               !DefSU->getInstr()->registerDefIsDead(Reg)))
            DefSU->addPred(SDep(SU, Kind, /*Latency=*/1, /*Reg=*/ *Alias));
        }
      }

      if (MO.isDef()) {
        // Add any data dependencies.
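        // This instruction defines Reg, so every use of Reg recorded below it
        // in the block (in UseList) receives a data edge from this node.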
        unsigned DataLatency = SU->Latency;
        for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
          SUnit *UseSU = UseList[i];
          if (UseSU != SU) {
            unsigned LDataLatency = DataLatency;
            // Optionally add in a special extra latency for nodes that
            // feed addresses.
            // TODO: Do this for register aliases too.
            if (SpecialAddressLatency != 0 && !UnitLatencies) {
              MachineInstr *UseMI = UseSU->getInstr();
              const TargetInstrDesc &UseTID = UseMI->getDesc();
              int RegUseIndex = UseMI->findRegisterUseOperandIdx(Reg);
              assert(RegUseIndex >= 0 && "UseMI doesn't use register!");
              if ((UseTID.mayLoad() || UseTID.mayStore()) &&
                  (unsigned)RegUseIndex < UseTID.getNumOperands() &&
                  UseTID.OpInfo[RegUseIndex].isLookupPtrRegClass())
                LDataLatency += SpecialAddressLatency;
            }
            UseSU->addPred(SDep(SU, SDep::Data, LDataLatency, Reg));
          }
        }
        for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
          std::vector<SUnit *> &UseList = Uses[*Alias];
          for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
            SUnit *UseSU = UseList[i];
            if (UseSU != SU)
              UseSU->addPred(SDep(SU, SDep::Data, DataLatency, *Alias));
          }
        }

        // If a def is going to wrap back around to the top of the loop,
        // backschedule it.
        // TODO: Blocks in loops without terminators can benefit too.
        if (!UnitLatencies && Terminator && DefList.empty()) {
          LoopDependencies::LoopDeps::iterator I = LoopRegs.Deps.find(Reg);
          if (I != LoopRegs.Deps.end()) {
            const MachineOperand *UseMO = I->second.first;
            unsigned Count = I->second.second;
            const MachineInstr *UseMI = UseMO->getParent();
            unsigned UseMOIdx = UseMO - &UseMI->getOperand(0);
            const TargetInstrDesc &UseTID = UseMI->getDesc();
            // TODO: If we knew the total depth of the region here, we could
            // handle the case where the whole loop is inside the region but
            // is large enough that the isScheduleHigh trick isn't needed.
            if (UseMOIdx < UseTID.getNumOperands()) {
              // Currently, we only support scheduling regions consisting of
              // single basic blocks. Check to see if the instruction is in
              // the same region by checking to see if it has the same parent.
              if (UseMI->getParent() != MI->getParent()) {
                unsigned Latency = SU->Latency;
                if (UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass())
                  Latency += SpecialAddressLatency;
                // This is a wild guess as to the portion of the latency which
                // will be overlapped by work done outside the current
                // scheduling region.
                Latency -= std::min(Latency, Count);
                // Add the artificial edge.
                Terminator->addPred(SDep(SU, SDep::Order, Latency,
                                         /*Reg=*/0, /*isNormalMemory=*/false,
                                         /*isMustAlias=*/false,
                                         /*isArtificial=*/true));
              } else if (SpecialAddressLatency > 0 &&
                         UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass()) {
                // The entire loop body is within the current scheduling region
                // and the latency of this operation is assumed to be greater
                // than the latency of the loop.
                // TODO: Recursively mark data-edge predecessors as
                // isScheduleHigh too.
                SU->isScheduleHigh = true;
              }
            }
            LoopRegs.Deps.erase(I);
          }
        }

        UseList.clear();
        if (!MO.isDead())
          DefList.clear();
        DefList.push_back(SU);
      } else {
        UseList.push_back(SU);
      }
    }

    // Add chain dependencies.
    // Note that isStoreToStackSlot and isLoadFromStackSlot are not usable
    // after stack slots are lowered to actual addresses.
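    // Chain (SDep::Order) edges conservatively serialize instructions that
    // may access the same memory, since register dependencies alone don't
    // capture memory aliasing.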
    // TODO: Use an AliasAnalysis and do real alias-analysis queries, and
    // produce more precise dependence information.
    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects()) {
    new_chain:
      // This is the conservative case. Add dependencies on all memory
      // references.
      if (Chain)
        Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
      Chain = SU;
      for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
        PendingLoads[k]->addPred(SDep(SU, SDep::Order, SU->Latency));
      PendingLoads.clear();
      for (std::map<const Value *, SUnit *>::iterator I = MemDefs.begin(),
           E = MemDefs.end(); I != E; ++I) {
        I->second->addPred(SDep(SU, SDep::Order, SU->Latency));
        I->second = SU;
      }
      for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
           MemUses.begin(), E = MemUses.end(); I != E; ++I) {
        for (unsigned i = 0, e = I->second.size(); i != e; ++i)
          I->second[i]->addPred(SDep(SU, SDep::Order, SU->Latency));
        I->second.clear();
      }
      // See if it is known to just have a single memory reference.
      MachineInstr *ChainMI = Chain->getInstr();
      const TargetInstrDesc &ChainTID = ChainMI->getDesc();
      if (!ChainTID.isCall() && !ChainTID.isTerminator() &&
          !ChainTID.hasUnmodeledSideEffects() &&
          ChainMI->hasOneMemOperand() &&
          !ChainMI->memoperands_begin()->isVolatile() &&
          ChainMI->memoperands_begin()->getValue())
        // We know that the Chain accesses one specific memory location.
        ChainMMO = &*ChainMI->memoperands_begin();
      else
        // Unknown memory accesses. Assume the worst.
        ChainMMO = 0;
    } else if (TID.mayStore()) {
      if (MI->hasOneMemOperand() &&
          MI->memoperands_begin()->getValue() &&
          !MI->memoperands_begin()->isVolatile() &&
          isa<PseudoSourceValue>(MI->memoperands_begin()->getValue())) {
        // A store to a specific PseudoSourceValue. Add precise dependencies.
        const Value *V = MI->memoperands_begin()->getValue();
        // Handle the def in MemDefs, if there is one.
        std::map<const Value *, SUnit *>::iterator I = MemDefs.find(V);
        if (I != MemDefs.end()) {
          I->second->addPred(SDep(SU, SDep::Order, SU->Latency, /*Reg=*/0,
                                  /*isNormalMemory=*/true));
          I->second = SU;
        } else {
          MemDefs[V] = SU;
        }
        // Handle the uses in MemUses, if there are any.
        std::map<const Value *, std::vector<SUnit *> >::iterator J =
          MemUses.find(V);
        if (J != MemUses.end()) {
          for (unsigned i = 0, e = J->second.size(); i != e; ++i)
            J->second[i]->addPred(SDep(SU, SDep::Order, SU->Latency, /*Reg=*/0,
                                       /*isNormalMemory=*/true));
          J->second.clear();
        }
        // Add a general dependence too, if needed.
        if (Chain)
          Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
      } else
        // Treat all other stores conservatively.
        goto new_chain;
    } else if (TID.mayLoad()) {
      if (TII->isInvariantLoad(MI)) {
        // Invariant load, no chain dependencies needed!
      } else if (MI->hasOneMemOperand() &&
                 MI->memoperands_begin()->getValue() &&
                 !MI->memoperands_begin()->isVolatile() &&
                 isa<PseudoSourceValue>(MI->memoperands_begin()->getValue())) {
        // A load from a specific PseudoSourceValue. Add precise dependencies.
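        // The load depends on the last store to the same Value, if any; it is
        // ordered against the general chain only when the chain node might
        // write memory.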
        const Value *V = MI->memoperands_begin()->getValue();
        std::map<const Value *, SUnit *>::iterator I = MemDefs.find(V);
        if (I != MemDefs.end())
          I->second->addPred(SDep(SU, SDep::Order, SU->Latency, /*Reg=*/0,
                                  /*isNormalMemory=*/true));
        MemUses[V].push_back(SU);

        // Add a general dependence too, if needed.
        if (Chain && (!ChainMMO ||
                      (ChainMMO->isStore() || ChainMMO->isVolatile())))
          Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
      } else if (MI->hasVolatileMemoryRef()) {
        // Treat volatile loads conservatively. Note that this includes
        // cases where memoperand information is unavailable.
        goto new_chain;
      } else {
        // A normal load. Just depend on the general chain.
        if (Chain)
          Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
        PendingLoads.push_back(SU);
      }
    }

    // Add chain edges from the terminator to ensure that all the work of the
    // block is completed before any control transfers.
    if (Terminator && SU->Succs.empty())
      Terminator->addPred(SDep(SU, SDep::Order, SU->Latency));
    if (TID.isTerminator() || MI->isLabel())
      Terminator = SU;
  }

  for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) {
    Defs[i].clear();
    Uses[i].clear();
  }
  PendingLoads.clear();
}

void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) {
  const InstrItineraryData &InstrItins = TM.getInstrItineraryData();

  // Compute the latency for the node. We use the sum of the latencies for
  // all nodes flagged together into this SUnit.
  SU->Latency =
    InstrItins.getLatency(SU->getInstr()->getDesc().getSchedClass());

  // Simplistic target-independent heuristic: assume that loads take
  // extra time.
  if (InstrItins.isEmpty())
    if (SU->getInstr()->getDesc().mayLoad())
      SU->Latency += 2;
}

void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
  SU->getInstr()->dump();
}

std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
  std::string s;
  raw_string_ostream oss(s);
  SU->getInstr()->print(oss);
  return oss.str();
}

// EmitSchedule - Emit the machine code in scheduled order.
MachineBasicBlock *ScheduleDAGInstrs::EmitSchedule() {
  // For MachineInstr-based scheduling, we're rescheduling the instructions in
  // the block, so start by removing them from the block.
  while (!BB->empty())
    BB->remove(BB->begin());

  // Then re-insert them according to the given schedule.
  for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
    SUnit *SU = Sequence[i];
    if (!SU) {
      // Null SUnit* is a noop.
      EmitNoop();
      continue;
    }

    BB->push_back(SU->getInstr());
  }

  return BB;
}