R600ControlFlowFinalizer.cpp revision dce4a407a24b04eebc6a376f8e62b41aaa7b071f
//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {

struct CFStack {

  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const AMDGPUSubtarget &ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries;
  unsigned CurrentSubEntries;

  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
       E = BranchStack.end(); I != E; ++I) {
    if (*I == Item)
      return true;
  }
  return false;
}

bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST.hasCFAluBug())
    return false;

  switch (Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST.getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
      // this work-around when CurrentSubEntries > 3 allows us to over-allocate
      // stack resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST.getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}
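// Worked example of the conservative test above (the figures follow the
// sub-entry packing assumed by updateMaxStackSize() below: four sub-entries
// per hardware stack entry on a wavefront-64 target): with the CF_ALU bug
// present, CurrentSubEntries values of 1-3 still fit into the first shared
// stack entry and need no work-around, but from the fourth live sub-entry
// onward requiresWorkAroundForInst() returns true and CF_ALU_PUSH_BEFORE is
// rewritten into an explicit CF_PUSH_EG followed by a plain CF_ALU (see the
// matching case in runOnMachineFunction()).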
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST.hasCaymanISA());
    if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize = CurrentEntries +
                              (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST.hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on
                                             // Evergreen/NI; see comment in
                                             // CFStack::getSubEntrySize().
      else if (CurrentEntries > 0 &&
               ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
               !ST.hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}
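// Worked example of the accounting above, for a hypothetical Evergreen
// vertex shader (values chosen only for illustration): construction reserves
// MaxStackSize = 1 for CALL_FS; pushLoop() raises CurrentEntries to 1; a
// first non-WQM branch push then adds getSubEntrySize(FIRST_NON_WQM_PUSH)
// = 2 sub-entries, so updateMaxStackSize() computes
//   1 + RoundUpToAlignment(2, 4) / 4 = 2
// entries and MaxStackSize becomes 2, the value eventually stored into
// R600MachineFunctionInfo::StackSize.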
class R600ControlFlowFinalizer : public MachineFunctionPass {

private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;
  unsigned MaxFetchInst;
  const AMDGPUSubtarget &ST;

  bool IsTrivialInst(MachineInstr *MI) const {
    switch (MI->getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST.hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

  bool isCompatibleWithClause(const MachineInstr *MI,
                              std::set<unsigned> &DstRegs) const {
    // Initialize to 0 (no register) so an instruction without register
    // defs/uses does not leave these values uninitialized.
    unsigned DstMI = 0, SrcMI = 0;
    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
         E = MI->operands_end(); I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if (DstRegs.find(SrcMI) == DstRegs.end()) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }

  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(I)) ||
          (!IsTex && !TII->usesVertexCache(I)))
        break;
      if (!isCompatibleWithClause(I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
                                getHWInstrDesc(IsTex ? CF_TC : CF_VC))
                            .addImm(0)                 // ADDR
                            .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, ClauseContent);
  }
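  // Illustrative shape of the output (the fetch instructions and operands
  // below are made up): a run of texture fetches such as
  //   TEX_SAMPLE T0.XYZW, T1.XYZW, ...
  //   TEX_SAMPLE T2.XYZW, T3.XYZW, ...
  // becomes a single clause headed by
  //   CF_TC_EG ADDR=0, COUNT=1    ; COUNT is the fetch count minus one
  // where ADDR is a placeholder that EmitFetchClause() later fixes up via
  // CounterPropagateAddr() once the clause body's final position is known.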
  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Srcs[i].second;
      std::vector<int64_t>::iterator It =
          std::find(Lits.begin(), Lits.end(), Imm);
      if (It != Lits.end()) {
        unsigned Index = It - Lits.begin();
        Srcs[i].first->setReg(LiteralRegs[Index]);
      } else {
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(Imm);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
                          TII->get(AMDGPU::LITERALS))
                      .addImm(LiteralPair0)
                      .addImm(LiteralPair1);
    }
    return InsertPos;
  }

  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<int64_t> Literals;
      if (I->isBundle()) {
        MachineInstr *DeleteMI = I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = BI->getOperand(i);
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(BI, Literals);
          ClauseContent.push_back(BI);
        }
        I = BI;
        DeleteMI->eraseFromParent();
      } else {
        getLiteral(I, Literals);
        ClauseContent.push_back(I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        unsigned literal0 = Literals[i];
        unsigned literal2 = (i + 1 < e) ? Literals[i + 1] : 0;
        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
                                      TII->get(AMDGPU::LITERALS))
                                  .addImm(literal0)
                                  .addImm(literal2);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(ClauseHead, ClauseContent);
  }
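  // Illustrative example of the literal handling above (the ALU group is
  // hypothetical): an instruction group whose sources read the immediates
  // 0x3F800000 and 0x40000000 has those operands renamed by getLiteral() to
  // ALU_LITERAL_X and ALU_LITERAL_Y, and MakeALUClause() then emits
  //   LITERALS 0x3F800000, 0x40000000
  // right after the group. Literals are packed two per LITERALS pseudo
  // instruction, with a zero padding the second slot when the count is odd.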
  void
  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
                  unsigned &CfCount) {
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

  void
  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
                unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (std::set<MachineInstr *>::const_iterator It = MIs.begin(),
         E = MIs.end(); It != E; ++It) {
      MachineInstr *MI = *It;
      CounterPropagateAddr(MI, Addr);
    }
  }
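  // Example of the address fix-up performed above (indices are made up): a
  // CF_JUMP created for IF_PREDICATE_SET starts with ADDR = 0; when the
  // matching ENDIF is processed at, say, CfCount == 7, CounterPropagateAddr()
  // adds 7 so the jump skips past the conditional region. BREAK/CONTINUE
  // instructions of a loop are collected in a set and patched the same way
  // once ENDLOOP resolves the loop's exit address.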
public:
  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
      TII(nullptr), TRI(nullptr),
      ST(tm.getSubtarget<AMDGPUSubtarget>()) {
    MaxFetchInst = ST.getTexVTXClauseSize();
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
    TRI = static_cast<const R600RegisterInfo *>(
        MF.getTarget().getRegisterInfo());
    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MFI->ShaderType);
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MFI->ShaderType == ShaderType::VERTEX) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
                getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                    TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          // Intentional fall-through: the (possibly rewritten) instruction
          // still heads an ALU clause.
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_WHILE_LOOP))
                                  .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(Pair);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *> > Pair =
              LoopStack.back();
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_JUMP))
                                  .addImm(0)
                                  .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_ELSE))
                                  .addImm(0)
                                  .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                        getHWInstrDesc(CF_POP))
                                    .addImm(CfCount + 1)
                                    .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_BREAK))
                                  .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_CONTINUE))
                                  .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
          CfCount++;
          if (CfCount % 2) {
            BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
            CfCount++;
          }
          // Erase the pseudo only after its debug location has been queried,
          // and break so the default case doesn't touch the dead instruction.
          MI->eraseFromParent();
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
                TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->StackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  const char *getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace
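// Usage sketch (the exact pipeline position lives in the target's pass
// configuration, not in this file; the addPass lines below are illustrative
// only): the finalizer is expected to run late, after the R600 packetizer
// has formed the ALU instruction bundles that MakeALUClause() consumes, e.g.
//   addPass(createR600Packetizer(*TM));
//   addPass(createR600ControlFlowFinalizer(*TM));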
llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}