R600Packetizer.cpp revision 36b56886974eae4f9c5ebc96befd3e7bfe5de338
1//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10/// \file 11/// This pass implements instructions packetization for R600. It unsets isLast 12/// bit of instructions inside a bundle and substitutes src register with 13/// PreviousVector when applicable. 14// 15//===----------------------------------------------------------------------===// 16 17#define DEBUG_TYPE "packets" 18#include "llvm/Support/Debug.h" 19#include "AMDGPU.h" 20#include "R600InstrInfo.h" 21#include "llvm/CodeGen/DFAPacketizer.h" 22#include "llvm/CodeGen/MachineDominators.h" 23#include "llvm/CodeGen/MachineFunctionPass.h" 24#include "llvm/CodeGen/MachineLoopInfo.h" 25#include "llvm/CodeGen/Passes.h" 26#include "llvm/CodeGen/ScheduleDAG.h" 27#include "llvm/Support/raw_ostream.h" 28 29using namespace llvm; 30 31namespace { 32 33class R600Packetizer : public MachineFunctionPass { 34 35public: 36 static char ID; 37 R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} 38 39 void getAnalysisUsage(AnalysisUsage &AU) const { 40 AU.setPreservesCFG(); 41 AU.addRequired<MachineDominatorTree>(); 42 AU.addPreserved<MachineDominatorTree>(); 43 AU.addRequired<MachineLoopInfo>(); 44 AU.addPreserved<MachineLoopInfo>(); 45 MachineFunctionPass::getAnalysisUsage(AU); 46 } 47 48 const char *getPassName() const { 49 return "R600 Packetizer"; 50 } 51 52 bool runOnMachineFunction(MachineFunction &Fn); 53}; 54char R600Packetizer::ID = 0; 55 56class R600PacketizerList : public VLIWPacketizerList { 57 58private: 59 const R600InstrInfo *TII; 60 const R600RegisterInfo &TRI; 61 bool VLIW5; 62 bool ConsideredInstUsesAlreadyWrittenVectorElement; 63 64 unsigned getSlot(const MachineInstr *MI) const { 65 return TRI.getHWRegChan(MI->getOperand(0).getReg()); 66 } 67 68 /// \returns register to PV chan mapping for bundle/single instructions that 69 /// immediately precedes I. 70 DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I) 71 const { 72 DenseMap<unsigned, unsigned> Result; 73 I--; 74 if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) 75 return Result; 76 MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); 77 if (I->isBundle()) 78 BI++; 79 int LastDstChan = -1; 80 do { 81 bool isTrans = false; 82 int BISlot = getSlot(BI); 83 if (LastDstChan >= BISlot) 84 isTrans = true; 85 LastDstChan = BISlot; 86 if (TII->isPredicated(BI)) 87 continue; 88 int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); 89 if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) 90 continue; 91 int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); 92 if (DstIdx == -1) { 93 continue; 94 } 95 unsigned Dst = BI->getOperand(DstIdx).getReg(); 96 if (isTrans || TII->isTransOnly(BI)) { 97 Result[Dst] = AMDGPU::PS; 98 continue; 99 } 100 if (BI->getOpcode() == AMDGPU::DOT4_r600 || 101 BI->getOpcode() == AMDGPU::DOT4_eg) { 102 Result[Dst] = AMDGPU::PV_X; 103 continue; 104 } 105 if (Dst == AMDGPU::OQAP) { 106 continue; 107 } 108 unsigned PVReg = 0; 109 switch (TRI.getHWRegChan(Dst)) { 110 case 0: 111 PVReg = AMDGPU::PV_X; 112 break; 113 case 1: 114 PVReg = AMDGPU::PV_Y; 115 break; 116 case 2: 117 PVReg = AMDGPU::PV_Z; 118 break; 119 case 3: 120 PVReg = AMDGPU::PV_W; 121 break; 122 default: 123 llvm_unreachable("Invalid Chan"); 124 } 125 Result[Dst] = PVReg; 126 } while ((++BI)->isBundledWithPred()); 127 return Result; 128 } 129 130 void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs) 131 const { 132 unsigned Ops[] = { 133 AMDGPU::OpName::src0, 134 AMDGPU::OpName::src1, 135 AMDGPU::OpName::src2 136 }; 137 for (unsigned i = 0; i < 3; i++) { 138 int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); 139 if (OperandIdx < 0) 140 continue; 141 unsigned Src = MI->getOperand(OperandIdx).getReg(); 142 const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src); 143 if (It != PVs.end()) 144 MI->getOperand(OperandIdx).setReg(It->second); 145 } 146 } 147public: 148 // Ctor. 149 R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, 150 MachineDominatorTree &MDT) 151 : VLIWPacketizerList(MF, MLI, MDT, true), 152 TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())), 153 TRI(TII->getRegisterInfo()) { 154 VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); 155 } 156 157 // initPacketizerState - initialize some internal flags. 158 void initPacketizerState() { 159 ConsideredInstUsesAlreadyWrittenVectorElement = false; 160 } 161 162 // ignorePseudoInstruction - Ignore bundling of pseudo instructions. 163 bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) { 164 return false; 165 } 166 167 // isSoloInstruction - return true if instruction MI can not be packetized 168 // with any other instruction, which means that MI itself is a packet. 169 bool isSoloInstruction(MachineInstr *MI) { 170 if (TII->isVector(*MI)) 171 return true; 172 if (!TII->isALUInstr(MI->getOpcode())) 173 return true; 174 if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) 175 return true; 176 // XXX: This can be removed once the packetizer properly handles all the 177 // LDS instruction group restrictions. 178 if (TII->isLDSInstr(MI->getOpcode())) 179 return true; 180 return false; 181 } 182 183 // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ 184 // together. 185 bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { 186 MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); 187 if (getSlot(MII) == getSlot(MIJ)) 188 ConsideredInstUsesAlreadyWrittenVectorElement = true; 189 // Does MII and MIJ share the same pred_sel ? 190 int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), 191 OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); 192 unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, 193 PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; 194 if (PredI != PredJ) 195 return false; 196 if (SUJ->isSucc(SUI)) { 197 for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { 198 const SDep &Dep = SUJ->Succs[i]; 199 if (Dep.getSUnit() != SUI) 200 continue; 201 if (Dep.getKind() == SDep::Anti) 202 continue; 203 if (Dep.getKind() == SDep::Output) 204 if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) 205 continue; 206 return false; 207 } 208 } 209 210 bool ARDef = TII->definesAddressRegister(MII) || 211 TII->definesAddressRegister(MIJ); 212 bool ARUse = TII->usesAddressRegister(MII) || 213 TII->usesAddressRegister(MIJ); 214 if (ARDef && ARUse) 215 return false; 216 217 return true; 218 } 219 220 // isLegalToPruneDependencies - Is it legal to prune dependece between SUI 221 // and SUJ. 222 bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;} 223 224 void setIsLastBit(MachineInstr *MI, unsigned Bit) const { 225 unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); 226 MI->getOperand(LastOp).setImm(Bit); 227 } 228 229 bool isBundlableWithCurrentPMI(MachineInstr *MI, 230 const DenseMap<unsigned, unsigned> &PV, 231 std::vector<R600InstrInfo::BankSwizzle> &BS, 232 bool &isTransSlot) { 233 isTransSlot = TII->isTransOnly(MI); 234 assert (!isTransSlot || VLIW5); 235 236 // Is the dst reg sequence legal ? 237 if (!isTransSlot && !CurrentPacketMIs.empty()) { 238 if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) { 239 if (ConsideredInstUsesAlreadyWrittenVectorElement && 240 !TII->isVectorOnly(MI) && VLIW5) { 241 isTransSlot = true; 242 DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump();); 243 } 244 else 245 return false; 246 } 247 } 248 249 // Are the Constants limitations met ? 250 CurrentPacketMIs.push_back(MI); 251 if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { 252 DEBUG( 253 dbgs() << "Couldn't pack :\n"; 254 MI->dump(); 255 dbgs() << "with the following packets :\n"; 256 for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { 257 CurrentPacketMIs[i]->dump(); 258 dbgs() << "\n"; 259 } 260 dbgs() << "because of Consts read limitations\n"; 261 ); 262 CurrentPacketMIs.pop_back(); 263 return false; 264 } 265 266 // Is there a BankSwizzle set that meet Read Port limitations ? 267 if (!TII->fitsReadPortLimitations(CurrentPacketMIs, 268 PV, BS, isTransSlot)) { 269 DEBUG( 270 dbgs() << "Couldn't pack :\n"; 271 MI->dump(); 272 dbgs() << "with the following packets :\n"; 273 for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { 274 CurrentPacketMIs[i]->dump(); 275 dbgs() << "\n"; 276 } 277 dbgs() << "because of Read port limitations\n"; 278 ); 279 CurrentPacketMIs.pop_back(); 280 return false; 281 } 282 283 // We cannot read LDS source registrs from the Trans slot. 284 if (isTransSlot && TII->readsLDSSrcReg(MI)) 285 return false; 286 287 CurrentPacketMIs.pop_back(); 288 return true; 289 } 290 291 MachineBasicBlock::iterator addToPacket(MachineInstr *MI) { 292 MachineBasicBlock::iterator FirstInBundle = 293 CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front(); 294 const DenseMap<unsigned, unsigned> &PV = 295 getPreviousVector(FirstInBundle); 296 std::vector<R600InstrInfo::BankSwizzle> BS; 297 bool isTransSlot; 298 299 if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) { 300 for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { 301 MachineInstr *MI = CurrentPacketMIs[i]; 302 unsigned Op = TII->getOperandIdx(MI->getOpcode(), 303 AMDGPU::OpName::bank_swizzle); 304 MI->getOperand(Op).setImm(BS[i]); 305 } 306 unsigned Op = TII->getOperandIdx(MI->getOpcode(), 307 AMDGPU::OpName::bank_swizzle); 308 MI->getOperand(Op).setImm(BS.back()); 309 if (!CurrentPacketMIs.empty()) 310 setIsLastBit(CurrentPacketMIs.back(), 0); 311 substitutePV(MI, PV); 312 MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI); 313 if (isTransSlot) { 314 endPacket(std::next(It)->getParent(), std::next(It)); 315 } 316 return It; 317 } 318 endPacket(MI->getParent(), MI); 319 if (TII->isTransOnly(MI)) 320 return MI; 321 return VLIWPacketizerList::addToPacket(MI); 322 } 323}; 324 325bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { 326 const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); 327 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); 328 MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); 329 330 // Instantiate the packetizer. 331 R600PacketizerList Packetizer(Fn, MLI, MDT); 332 333 // DFA state table should not be empty. 334 assert(Packetizer.getResourceTracker() && "Empty DFA table!"); 335 336 // 337 // Loop over all basic blocks and remove KILL pseudo-instructions 338 // These instructions confuse the dependence analysis. Consider: 339 // D0 = ... (Insn 0) 340 // R0 = KILL R0, D0 (Insn 1) 341 // R0 = ... (Insn 2) 342 // Here, Insn 1 will result in the dependence graph not emitting an output 343 // dependence between Insn 0 and Insn 2. This can lead to incorrect 344 // packetization 345 // 346 for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); 347 MBB != MBBe; ++MBB) { 348 MachineBasicBlock::iterator End = MBB->end(); 349 MachineBasicBlock::iterator MI = MBB->begin(); 350 while (MI != End) { 351 if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || 352 (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { 353 MachineBasicBlock::iterator DeleteMI = MI; 354 ++MI; 355 MBB->erase(DeleteMI); 356 End = MBB->end(); 357 continue; 358 } 359 ++MI; 360 } 361 } 362 363 // Loop over all of the basic blocks. 364 for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); 365 MBB != MBBe; ++MBB) { 366 // Find scheduling regions and schedule / packetize each region. 367 unsigned RemainingCount = MBB->size(); 368 for(MachineBasicBlock::iterator RegionEnd = MBB->end(); 369 RegionEnd != MBB->begin();) { 370 // The next region starts above the previous region. Look backward in the 371 // instruction stream until we find the nearest boundary. 372 MachineBasicBlock::iterator I = RegionEnd; 373 for(;I != MBB->begin(); --I, --RemainingCount) { 374 if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) 375 break; 376 } 377 I = MBB->begin(); 378 379 // Skip empty scheduling regions. 380 if (I == RegionEnd) { 381 RegionEnd = std::prev(RegionEnd); 382 --RemainingCount; 383 continue; 384 } 385 // Skip regions with one instruction. 386 if (I == std::prev(RegionEnd)) { 387 RegionEnd = std::prev(RegionEnd); 388 continue; 389 } 390 391 Packetizer.PacketizeMIs(MBB, I, RegionEnd); 392 RegionEnd = I; 393 } 394 } 395 396 return true; 397 398} 399 400} // end anonymous namespace 401 402llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { 403 return new R600Packetizer(tm); 404} 405