R600EmitClauseMarkers.cpp revision 19a99df130f5747da950faf4ca5170d71f05594c
1//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10/// \file 11/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold 12/// 128 Alu instructions ; these instructions can access up to 4 prefetched 13/// 4 lines of 16 registers from constant buffers. Such ALU clauses are 14/// initiated by CF_ALU instructions. 15//===----------------------------------------------------------------------===// 16 17#include "AMDGPU.h" 18#include "R600Defines.h" 19#include "R600InstrInfo.h" 20#include "R600MachineFunctionInfo.h" 21#include "R600RegisterInfo.h" 22#include "llvm/CodeGen/MachineFunctionPass.h" 23#include "llvm/CodeGen/MachineInstrBuilder.h" 24#include "llvm/CodeGen/MachineRegisterInfo.h" 25 26using namespace llvm; 27 28namespace { 29 30class R600EmitClauseMarkersPass : public MachineFunctionPass { 31 32private: 33 static char ID; 34 const R600InstrInfo *TII; 35 int Address; 36 37 unsigned OccupiedDwords(MachineInstr *MI) const { 38 switch (MI->getOpcode()) { 39 case AMDGPU::INTERP_PAIR_XY: 40 case AMDGPU::INTERP_PAIR_ZW: 41 case AMDGPU::INTERP_VEC_LOAD: 42 case AMDGPU::DOT_4: 43 return 4; 44 case AMDGPU::KILL: 45 return 0; 46 default: 47 break; 48 } 49 50 // These will be expanded to two ALU instructions in the 51 // ExpandSpecialInstructions pass. 52 if (TII->isLDSRetInstr(MI->getOpcode())) 53 return 2; 54 55 if(TII->isVector(*MI) || 56 TII->isCubeOp(MI->getOpcode()) || 57 TII->isReductionOp(MI->getOpcode())) 58 return 4; 59 60 unsigned NumLiteral = 0; 61 for (MachineInstr::mop_iterator It = MI->operands_begin(), 62 E = MI->operands_end(); It != E; ++It) { 63 MachineOperand &MO = *It; 64 if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) 65 ++NumLiteral; 66 } 67 return 1 + NumLiteral; 68 } 69 70 bool isALU(const MachineInstr *MI) const { 71 if (TII->isALUInstr(MI->getOpcode())) 72 return true; 73 if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) 74 return true; 75 switch (MI->getOpcode()) { 76 case AMDGPU::PRED_X: 77 case AMDGPU::INTERP_PAIR_XY: 78 case AMDGPU::INTERP_PAIR_ZW: 79 case AMDGPU::INTERP_VEC_LOAD: 80 case AMDGPU::COPY: 81 case AMDGPU::DOT_4: 82 return true; 83 default: 84 return false; 85 } 86 } 87 88 bool IsTrivialInst(MachineInstr *MI) const { 89 switch (MI->getOpcode()) { 90 case AMDGPU::KILL: 91 case AMDGPU::RETURN: 92 case AMDGPU::IMPLICIT_DEF: 93 return true; 94 default: 95 return false; 96 } 97 } 98 99 std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const { 100 // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 101 // (See also R600ISelLowering.cpp) 102 // ConstIndex value is in [0, 4095]; 103 return std::pair<unsigned, unsigned>( 104 ((Sel >> 2) - 512) >> 12, // KC_BANK 105 // Line Number of ConstIndex 106 // A line contains 16 constant registers however KCX bank can lock 107 // two line at the same time ; thus we want to get an even line number. 108 // Line number can be retrieved with (>>4), using (>>5) <<1 generates 109 // an even number. 110 ((((Sel >> 2) - 512) & 4095) >> 5) << 1); 111 } 112 113 bool SubstituteKCacheBank(MachineInstr *MI, 114 std::vector<std::pair<unsigned, unsigned> > &CachedConsts, 115 bool UpdateInstr = true) const { 116 std::vector<std::pair<unsigned, unsigned> > UsedKCache; 117 118 if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) 119 return true; 120 121 const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts = 122 TII->getSrcs(MI); 123 assert((TII->isALUInstr(MI->getOpcode()) || 124 MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); 125 for (unsigned i = 0, n = Consts.size(); i < n; ++i) { 126 if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) 127 continue; 128 unsigned Sel = Consts[i].second; 129 unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; 130 unsigned KCacheIndex = Index * 4 + Chan; 131 const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel); 132 if (CachedConsts.empty()) { 133 CachedConsts.push_back(BankLine); 134 UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); 135 continue; 136 } 137 if (CachedConsts[0] == BankLine) { 138 UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); 139 continue; 140 } 141 if (CachedConsts.size() == 1) { 142 CachedConsts.push_back(BankLine); 143 UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); 144 continue; 145 } 146 if (CachedConsts[1] == BankLine) { 147 UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); 148 continue; 149 } 150 return false; 151 } 152 153 if (!UpdateInstr) 154 return true; 155 156 for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { 157 if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) 158 continue; 159 switch(UsedKCache[j].first) { 160 case 0: 161 Consts[i].first->setReg( 162 AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); 163 break; 164 case 1: 165 Consts[i].first->setReg( 166 AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); 167 break; 168 default: 169 llvm_unreachable("Wrong Cache Line"); 170 } 171 j++; 172 } 173 return true; 174 } 175 176 bool canClauseLocalKillFitInClause( 177 unsigned AluInstCount, 178 std::vector<std::pair<unsigned, unsigned> > KCacheBanks, 179 MachineBasicBlock::iterator Def, 180 MachineBasicBlock::iterator BBEnd) { 181 const R600RegisterInfo &TRI = TII->getRegisterInfo(); 182 for (MachineInstr::const_mop_iterator 183 MOI = Def->operands_begin(), 184 MOE = Def->operands_end(); MOI != MOE; ++MOI) { 185 if (!MOI->isReg() || !MOI->isDef() || 186 TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) 187 continue; 188 189 // Def defines a clause local register, so check that its use will fit 190 // in the clause. 191 unsigned LastUseCount = 0; 192 for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { 193 AluInstCount += OccupiedDwords(UseI); 194 // Make sure we won't need to end the clause due to KCache limitations. 195 if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) 196 return false; 197 198 // We have reached the maximum instruction limit before finding the 199 // use that kills this register, so we cannot use this def in the 200 // current clause. 201 if (AluInstCount >= TII->getMaxAlusPerClause()) 202 return false; 203 204 // Register kill flags have been cleared by the time we get to this 205 // pass, but it is safe to assume that all uses of this register 206 // occur in the same basic block as its definition, because 207 // it is illegal for the scheduler to schedule them in 208 // different blocks. 209 if (UseI->findRegisterUseOperandIdx(MOI->getReg())) 210 LastUseCount = AluInstCount; 211 212 if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) 213 break; 214 } 215 if (LastUseCount) 216 return LastUseCount <= TII->getMaxAlusPerClause(); 217 llvm_unreachable("Clause local register live at end of clause."); 218 } 219 return true; 220 } 221 222 MachineBasicBlock::iterator 223 MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { 224 MachineBasicBlock::iterator ClauseHead = I; 225 std::vector<std::pair<unsigned, unsigned> > KCacheBanks; 226 bool PushBeforeModifier = false; 227 unsigned AluInstCount = 0; 228 for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { 229 if (IsTrivialInst(I)) 230 continue; 231 if (!isALU(I)) 232 break; 233 if (AluInstCount > TII->getMaxAlusPerClause()) 234 break; 235 if (I->getOpcode() == AMDGPU::PRED_X) { 236 // We put PRED_X in its own clause to ensure that ifcvt won't create 237 // clauses with more than 128 insts. 238 // IfCvt is indeed checking that "then" and "else" branches of an if 239 // statement have less than ~60 insts thus converted clauses can't be 240 // bigger than ~121 insts (predicate setter needs to be in the same 241 // clause as predicated alus). 242 if (AluInstCount > 0) 243 break; 244 if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) 245 PushBeforeModifier = true; 246 AluInstCount ++; 247 continue; 248 } 249 // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: 250 // 251 // * KILL or INTERP instructions 252 // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits 253 // * Uses waterfalling (i.e. INDEX_MODE = AR.X) 254 // 255 // XXX: These checks have not been implemented yet. 256 if (TII->mustBeLastInClause(I->getOpcode())) { 257 I++; 258 break; 259 } 260 261 // If this instruction defines a clause local register, make sure 262 // its use can fit in this clause. 263 if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) 264 break; 265 266 if (!SubstituteKCacheBank(I, KCacheBanks)) 267 break; 268 AluInstCount += OccupiedDwords(I); 269 } 270 unsigned Opcode = PushBeforeModifier ? 271 AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; 272 BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) 273 // We don't use the ADDR field until R600ControlFlowFinalizer pass, where 274 // it is safe to assume it is 0. However if we always put 0 here, the ifcvt 275 // pass may assume that identical ALU clause starter at the beginning of a 276 // true and false branch can be factorized which is not the case. 277 .addImm(Address++) // ADDR 278 .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 279 .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 280 .addImm(KCacheBanks.empty()?0:2) // KM0 281 .addImm((KCacheBanks.size() < 2)?0:2) // KM1 282 .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 283 .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 284 .addImm(AluInstCount) // COUNT 285 .addImm(1); // Enabled 286 return I; 287 } 288 289public: 290 R600EmitClauseMarkersPass(TargetMachine &tm) : MachineFunctionPass(ID), 291 TII(0), Address(0) { } 292 293 virtual bool runOnMachineFunction(MachineFunction &MF) { 294 TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); 295 296 for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); 297 BB != BB_E; ++BB) { 298 MachineBasicBlock &MBB = *BB; 299 MachineBasicBlock::iterator I = MBB.begin(); 300 if (I->getOpcode() == AMDGPU::CF_ALU) 301 continue; // BB was already parsed 302 for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { 303 if (isALU(I)) 304 I = MakeALUClause(MBB, I); 305 else 306 ++I; 307 } 308 } 309 return false; 310 } 311 312 const char *getPassName() const { 313 return "R600 Emit Clause Markers Pass"; 314 } 315}; 316 317char R600EmitClauseMarkersPass::ID = 0; 318 319} // end anonymous namespace 320 321 322llvm::FunctionPass *llvm::createR600EmitClauseMarkers(TargetMachine &TM) { 323 return new R600EmitClauseMarkersPass(TM); 324} 325 326