1//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// The machine combiner pass uses machine trace metrics to ensure the combined 11// instructions does not lengthen the critical path or the resource depth. 12//===----------------------------------------------------------------------===// 13 14#define DEBUG_TYPE "machine-combiner" 15 16#include "llvm/ADT/Statistic.h" 17#include "llvm/ADT/DenseMap.h" 18#include "llvm/CodeGen/MachineDominators.h" 19#include "llvm/CodeGen/MachineFunction.h" 20#include "llvm/CodeGen/MachineFunctionPass.h" 21#include "llvm/CodeGen/MachineInstrBuilder.h" 22#include "llvm/CodeGen/MachineLoopInfo.h" 23#include "llvm/CodeGen/MachineRegisterInfo.h" 24#include "llvm/CodeGen/MachineTraceMetrics.h" 25#include "llvm/CodeGen/Passes.h" 26#include "llvm/CodeGen/TargetSchedule.h" 27#include "llvm/Support/CommandLine.h" 28#include "llvm/Support/Debug.h" 29#include "llvm/Support/raw_ostream.h" 30#include "llvm/Target/TargetInstrInfo.h" 31#include "llvm/Target/TargetRegisterInfo.h" 32#include "llvm/Target/TargetSubtargetInfo.h" 33 34using namespace llvm; 35 36STATISTIC(NumInstCombined, "Number of machineinst combined"); 37 38namespace { 39class MachineCombiner : public MachineFunctionPass { 40 const TargetInstrInfo *TII; 41 const TargetRegisterInfo *TRI; 42 MCSchedModel SchedModel; 43 MachineRegisterInfo *MRI; 44 MachineTraceMetrics *Traces; 45 MachineTraceMetrics::Ensemble *MinInstr; 46 47 TargetSchedModel TSchedModel; 48 49 /// True if optimizing for code size. 
50 bool OptSize; 51 52public: 53 static char ID; 54 MachineCombiner() : MachineFunctionPass(ID) { 55 initializeMachineCombinerPass(*PassRegistry::getPassRegistry()); 56 } 57 void getAnalysisUsage(AnalysisUsage &AU) const override; 58 bool runOnMachineFunction(MachineFunction &MF) override; 59 const char *getPassName() const override { return "Machine InstCombiner"; } 60 61private: 62 bool doSubstitute(unsigned NewSize, unsigned OldSize); 63 bool combineInstructions(MachineBasicBlock *); 64 MachineInstr *getOperandDef(const MachineOperand &MO); 65 unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, 66 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 67 MachineTraceMetrics::Trace BlockTrace); 68 unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot, 69 MachineTraceMetrics::Trace BlockTrace); 70 bool 71 improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root, 72 MachineTraceMetrics::Trace BlockTrace, 73 SmallVectorImpl<MachineInstr *> &InsInstrs, 74 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 75 MachineCombinerPattern Pattern); 76 bool preservesResourceLen(MachineBasicBlock *MBB, 77 MachineTraceMetrics::Trace BlockTrace, 78 SmallVectorImpl<MachineInstr *> &InsInstrs, 79 SmallVectorImpl<MachineInstr *> &DelInstrs); 80 void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs, 81 SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC); 82}; 83} 84 85char MachineCombiner::ID = 0; 86char &llvm::MachineCombinerID = MachineCombiner::ID; 87 88INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", 89 "Machine InstCombiner", false, false) 90INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) 91INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", 92 false, false) 93 94void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { 95 AU.setPreservesCFG(); 96 AU.addPreserved<MachineDominatorTree>(); 97 AU.addPreserved<MachineLoopInfo>(); 98 AU.addRequired<MachineTraceMetrics>(); 99 
AU.addPreserved<MachineTraceMetrics>(); 100 MachineFunctionPass::getAnalysisUsage(AU); 101} 102 103MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) { 104 MachineInstr *DefInstr = nullptr; 105 // We need a virtual register definition. 106 if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) 107 DefInstr = MRI->getUniqueVRegDef(MO.getReg()); 108 // PHI's have no depth etc. 109 if (DefInstr && DefInstr->isPHI()) 110 DefInstr = nullptr; 111 return DefInstr; 112} 113 114/// Computes depth of instructions in vector \InsInstr. 115/// 116/// \param InsInstrs is a vector of machine instructions 117/// \param InstrIdxForVirtReg is a dense map of virtual register to index 118/// of defining machine instruction in \p InsInstrs 119/// \param BlockTrace is a trace of machine instructions 120/// 121/// \returns Depth of last instruction in \InsInstrs ("NewRoot") 122unsigned 123MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, 124 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 125 MachineTraceMetrics::Trace BlockTrace) { 126 SmallVector<unsigned, 16> InstrDepth; 127 assert(TSchedModel.hasInstrSchedModelOrItineraries() && 128 "Missing machine model\n"); 129 130 // For each instruction in the new sequence compute the depth based on the 131 // operands. Use the trace information when possible. For new operands which 132 // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth 133 for (auto *InstrPtr : InsInstrs) { // for each Use 134 unsigned IDepth = 0; 135 DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";); 136 for (const MachineOperand &MO : InstrPtr->operands()) { 137 // Check for virtual register operand. 
138 if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) 139 continue; 140 if (!MO.isUse()) 141 continue; 142 unsigned DepthOp = 0; 143 unsigned LatencyOp = 0; 144 DenseMap<unsigned, unsigned>::iterator II = 145 InstrIdxForVirtReg.find(MO.getReg()); 146 if (II != InstrIdxForVirtReg.end()) { 147 // Operand is new virtual register not in trace 148 assert(II->second < InstrDepth.size() && "Bad Index"); 149 MachineInstr *DefInstr = InsInstrs[II->second]; 150 assert(DefInstr && 151 "There must be a definition for a new virtual register"); 152 DepthOp = InstrDepth[II->second]; 153 LatencyOp = TSchedModel.computeOperandLatency( 154 DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), 155 InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); 156 } else { 157 MachineInstr *DefInstr = getOperandDef(MO); 158 if (DefInstr) { 159 DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth; 160 LatencyOp = TSchedModel.computeOperandLatency( 161 DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), 162 InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); 163 } 164 } 165 IDepth = std::max(IDepth, DepthOp + LatencyOp); 166 } 167 InstrDepth.push_back(IDepth); 168 } 169 unsigned NewRootIdx = InsInstrs.size() - 1; 170 return InstrDepth[NewRootIdx]; 171} 172 173/// Computes instruction latency as max of latency of defined operands. 174/// 175/// \param Root is a machine instruction that could be replaced by NewRoot. 
176/// It is used to compute a more accurate latency information for NewRoot in 177/// case there is a dependent instruction in the same trace (\p BlockTrace) 178/// \param NewRoot is the instruction for which the latency is computed 179/// \param BlockTrace is a trace of machine instructions 180/// 181/// \returns Latency of \p NewRoot 182unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, 183 MachineTraceMetrics::Trace BlockTrace) { 184 assert(TSchedModel.hasInstrSchedModelOrItineraries() && 185 "Missing machine model\n"); 186 187 // Check each definition in NewRoot and compute the latency 188 unsigned NewRootLatency = 0; 189 190 for (const MachineOperand &MO : NewRoot->operands()) { 191 // Check for virtual register operand. 192 if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) 193 continue; 194 if (!MO.isDef()) 195 continue; 196 // Get the first instruction that uses MO 197 MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg()); 198 RI++; 199 MachineInstr *UseMO = RI->getParent(); 200 unsigned LatencyOp = 0; 201 if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) { 202 LatencyOp = TSchedModel.computeOperandLatency( 203 NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO, 204 UseMO->findRegisterUseOperandIdx(MO.getReg())); 205 } else { 206 LatencyOp = TSchedModel.computeInstrLatency(NewRoot); 207 } 208 NewRootLatency = std::max(NewRootLatency, LatencyOp); 209 } 210 return NewRootLatency; 211} 212 213/// The combiner's goal may differ based on which pattern it is attempting 214/// to optimize. 215enum class CombinerObjective { 216 MustReduceDepth, // The data dependency chain must be improved. 217 Default // The critical path must not be lengthened. 218}; 219 220static CombinerObjective getCombinerObjective(MachineCombinerPattern P) { 221 // TODO: If C++ ever gets a real enum class, make this part of the 222 // MachineCombinerPattern class. 
223 switch (P) { 224 case MachineCombinerPattern::REASSOC_AX_BY: 225 case MachineCombinerPattern::REASSOC_AX_YB: 226 case MachineCombinerPattern::REASSOC_XA_BY: 227 case MachineCombinerPattern::REASSOC_XA_YB: 228 return CombinerObjective::MustReduceDepth; 229 default: 230 return CombinerObjective::Default; 231 } 232} 233 234/// The DAGCombine code sequence ends in MI (Machine Instruction) Root. 235/// The new code sequence ends in MI NewRoot. A necessary condition for the new 236/// sequence to replace the old sequence is that it cannot lengthen the critical 237/// path. The definition of "improve" may be restricted by specifying that the 238/// new path improves the data dependency chain (MustReduceDepth). 239bool MachineCombiner::improvesCriticalPathLen( 240 MachineBasicBlock *MBB, MachineInstr *Root, 241 MachineTraceMetrics::Trace BlockTrace, 242 SmallVectorImpl<MachineInstr *> &InsInstrs, 243 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 244 MachineCombinerPattern Pattern) { 245 assert(TSchedModel.hasInstrSchedModelOrItineraries() && 246 "Missing machine model\n"); 247 // NewRoot is the last instruction in the \p InsInstrs vector. 248 unsigned NewRootIdx = InsInstrs.size() - 1; 249 MachineInstr *NewRoot = InsInstrs[NewRootIdx]; 250 251 // Get depth and latency of NewRoot and Root. 252 unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); 253 unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth; 254 255 DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; 256 dbgs() << " NewRootDepth: " << NewRootDepth << "\n"; 257 dbgs() << " RootDepth: " << RootDepth << "\n"); 258 259 // For a transform such as reassociation, the cost equation is 260 // conservatively calculated so that we must improve the depth (data 261 // dependency cycles) in the critical path to proceed with the transform. 262 // Being conservative also protects against inaccuracies in the underlying 263 // machine trace metrics and CPU models. 
264 if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) 265 return NewRootDepth < RootDepth; 266 267 // A more flexible cost calculation for the critical path includes the slack 268 // of the original code sequence. This may allow the transform to proceed 269 // even if the instruction depths (data dependency cycles) become worse. 270 unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); 271 unsigned RootLatency = TSchedModel.computeInstrLatency(Root); 272 unsigned RootSlack = BlockTrace.getInstrSlack(Root); 273 274 DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n"; 275 dbgs() << " RootLatency: " << RootLatency << "\n"; 276 dbgs() << " RootSlack: " << RootSlack << "\n"; 277 dbgs() << " NewRootDepth + NewRootLatency = " 278 << NewRootDepth + NewRootLatency << "\n"; 279 dbgs() << " RootDepth + RootLatency + RootSlack = " 280 << RootDepth + RootLatency + RootSlack << "\n";); 281 282 unsigned NewCycleCount = NewRootDepth + NewRootLatency; 283 unsigned OldCycleCount = RootDepth + RootLatency + RootSlack; 284 285 return NewCycleCount <= OldCycleCount; 286} 287 288/// helper routine to convert instructions into SC 289void MachineCombiner::instr2instrSC( 290 SmallVectorImpl<MachineInstr *> &Instrs, 291 SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) { 292 for (auto *InstrPtr : Instrs) { 293 unsigned Opc = InstrPtr->getOpcode(); 294 unsigned Idx = TII->get(Opc).getSchedClass(); 295 const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx); 296 InstrsSC.push_back(SC); 297 } 298} 299 300/// True when the new instructions do not increase resource length 301bool MachineCombiner::preservesResourceLen( 302 MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, 303 SmallVectorImpl<MachineInstr *> &InsInstrs, 304 SmallVectorImpl<MachineInstr *> &DelInstrs) { 305 if (!TSchedModel.hasInstrSchedModel()) 306 return true; 307 308 // Compute current resource length 309 310 //ArrayRef<const MachineBasicBlock *> MBBarr(MBB); 311 
SmallVector <const MachineBasicBlock *, 1> MBBarr; 312 MBBarr.push_back(MBB); 313 unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr); 314 315 // Deal with SC rather than Instructions. 316 SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC; 317 SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC; 318 319 instr2instrSC(InsInstrs, InsInstrsSC); 320 instr2instrSC(DelInstrs, DelInstrsSC); 321 322 ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC); 323 ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC); 324 325 // Compute new resource length. 326 unsigned ResLenAfterCombine = 327 BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr); 328 329 DEBUG(dbgs() << "RESOURCE DATA: \n"; 330 dbgs() << " resource len before: " << ResLenBeforeCombine 331 << " after: " << ResLenAfterCombine << "\n";); 332 333 return ResLenAfterCombine <= ResLenBeforeCombine; 334} 335 336/// \returns true when new instruction sequence should be generated 337/// independent if it lengthens critical path or not 338bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { 339 if (OptSize && (NewSize < OldSize)) 340 return true; 341 if (!TSchedModel.hasInstrSchedModelOrItineraries()) 342 return true; 343 return false; 344} 345 346/// Substitute a slow code sequence with a faster one by 347/// evaluating instruction combining pattern. 348/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction 349/// combining based on machine trace metrics. Only combine a sequence of 350/// instructions when this neither lengthens the critical path nor increases 351/// resource pressure. When optimizing for codesize always combine when the new 352/// sequence is shorter. 
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();

  // Advance the iterator before inspecting MI so that erasing MI's pattern
  // (the DelInstrs below) cannot invalidate the iteration position.
  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) is met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.

    if (!TII->getMachineCombinerPatterns(MI, Patterns))
      continue;

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      // Create the minimum-instruction-count trace ensemble lazily, on the
      // first pattern actually found.
      if (!MinInstr)
        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
      MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
      Traces->verifyAnalysis();
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      // Substitute when we optimize for codesize and the new sequence has
      // fewer instructions OR
      // the new sequence neither lengthens the critical path nor increases
      // resource pressure.
      if (doSubstitute(NewInstCount, OldInstCount) ||
          (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                   InstrIdxForVirtReg, P) &&
           preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
        // Insert the new sequence immediately before the old root, then
        // erase the old sequence.
        for (auto *InstrPtr : InsInstrs)
          MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
        for (auto *InstrPtr : DelInstrs)
          InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

        Changed = true;
        ++NumInstCombined;

        // The block's instructions changed; trace metrics for this block
        // must be recomputed before the next query.
        Traces->invalidate(MBB);
        Traces->verifyAnalysis();
        // Eagerly stop after the first pattern fires.
        break;
      } else {
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->DeleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  // Cache per-function target hooks and analyses used by the helpers above.
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  Traces = &getAnalysis<MachineTraceMetrics>();
  // The trace ensemble is created lazily in combineInstructions().
  MinInstr = nullptr;
  OptSize = MF.getFunction()->optForSize();

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs() << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}