SIISelLowering.cpp revision 36ba9091843bd1205fe3499ba4b55bbedc6583c9
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10/// \file 11/// \brief Custom DAG lowering for SI 12// 13//===----------------------------------------------------------------------===// 14 15#include "SIISelLowering.h" 16#include "AMDIL.h" 17#include "AMDILIntrinsicInfo.h" 18#include "SIInstrInfo.h" 19#include "SIMachineFunctionInfo.h" 20#include "SIRegisterInfo.h" 21#include "llvm/CodeGen/MachineInstrBuilder.h" 22#include "llvm/CodeGen/MachineRegisterInfo.h" 23#include "llvm/CodeGen/SelectionDAG.h" 24 25using namespace llvm; 26 27SITargetLowering::SITargetLowering(TargetMachine &TM) : 28 AMDGPUTargetLowering(TM), 29 TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())) { 30 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 31 addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); 32 addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); 33 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 34 addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass); 35 addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass); 36 37 addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass); 38 addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass); 39 addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); 40 addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); 41 addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass); 42 43 computeRegisterProperties(); 44 45 setOperationAction(ISD::AND, MVT::i1, Custom); 46 47 setOperationAction(ISD::ADD, MVT::i64, Legal); 48 setOperationAction(ISD::ADD, MVT::i32, Legal); 49 50 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 51 52 // We need to custom lower loads from the USER_SGPR address space, so we can 53 // add the SGPRs as livein registers. 54 setOperationAction(ISD::LOAD, MVT::i32, Custom); 55 setOperationAction(ISD::LOAD, MVT::i64, Custom); 56 57 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 58 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 59 60 setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); 61 setTargetDAGCombine(ISD::SELECT_CC); 62 63 setTargetDAGCombine(ISD::SETCC); 64} 65 66MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( 67 MachineInstr * MI, MachineBasicBlock * BB) const { 68 const TargetInstrInfo * TII = getTargetMachine().getInstrInfo(); 69 MachineRegisterInfo & MRI = BB->getParent()->getRegInfo(); 70 MachineBasicBlock::iterator I = MI; 71 72 switch (MI->getOpcode()) { 73 default: 74 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 75 case AMDGPU::BRANCH: return BB; 76 case AMDGPU::CLAMP_SI: 77 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) 78 .addOperand(MI->getOperand(0)) 79 .addOperand(MI->getOperand(1)) 80 // VSRC1-2 are unused, but we still need to fill all the 81 // operand slots, so we just reuse the VSRC0 operand 82 .addOperand(MI->getOperand(1)) 83 .addOperand(MI->getOperand(1)) 84 .addImm(0) // ABS 85 .addImm(1) // CLAMP 86 .addImm(0) // OMOD 87 .addImm(0); // NEG 88 MI->eraseFromParent(); 89 break; 90 91 case AMDGPU::FABS_SI: 92 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) 93 .addOperand(MI->getOperand(0)) 94 .addOperand(MI->getOperand(1)) 95 // VSRC1-2 are unused, but we still need to fill all the 96 // operand slots, so we just reuse the VSRC0 operand 97 .addOperand(MI->getOperand(1)) 98 .addOperand(MI->getOperand(1)) 99 .addImm(1) // ABS 100 .addImm(0) // CLAMP 101 .addImm(0) // OMOD 102 .addImm(0); // NEG 103 MI->eraseFromParent(); 104 break; 105 106 case AMDGPU::FNEG_SI: 107 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) 108 .addOperand(MI->getOperand(0)) 109 .addOperand(MI->getOperand(1)) 110 // VSRC1-2 are unused, but we still need to fill all the 111 // operand slots, so we just reuse the VSRC0 operand 112 .addOperand(MI->getOperand(1)) 113 .addOperand(MI->getOperand(1)) 114 .addImm(0) // ABS 115 .addImm(0) // CLAMP 116 .addImm(0) // OMOD 117 .addImm(1); // NEG 118 MI->eraseFromParent(); 119 break; 120 case AMDGPU::SHADER_TYPE: 121 BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType = 122 MI->getOperand(0).getImm(); 123 MI->eraseFromParent(); 124 break; 125 126 case AMDGPU::SI_INTERP: 127 LowerSI_INTERP(MI, *BB, I, MRI); 128 break; 129 case AMDGPU::SI_INTERP_CONST: 130 LowerSI_INTERP_CONST(MI, *BB, I, MRI); 131 break; 132 case AMDGPU::SI_WQM: 133 LowerSI_WQM(MI, *BB, I, MRI); 134 break; 135 case AMDGPU::SI_V_CNDLT: 136 LowerSI_V_CNDLT(MI, *BB, I, MRI); 137 break; 138 } 139 return BB; 140} 141 142void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, 143 MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { 144 BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) 145 .addReg(AMDGPU::EXEC); 146 147 MI->eraseFromParent(); 148} 149 150void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, 151 MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { 152 unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); 153 unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass); 154 MachineOperand dst = MI->getOperand(0); 155 MachineOperand iReg = MI->getOperand(1); 156 MachineOperand jReg = MI->getOperand(2); 157 MachineOperand attr_chan = MI->getOperand(3); 158 MachineOperand attr = MI->getOperand(4); 159 MachineOperand params = MI->getOperand(5); 160 161 BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0) 162 .addOperand(params); 163 164 BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp) 165 .addOperand(iReg) 166 .addOperand(attr_chan) 167 .addOperand(attr) 168 .addReg(M0); 169 170 BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32)) 171 .addOperand(dst) 172 .addReg(tmp) 173 .addOperand(jReg) 174 .addOperand(attr_chan) 175 .addOperand(attr) 176 .addReg(M0); 177 178 MI->eraseFromParent(); 179} 180 181void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI, 182 MachineBasicBlock &BB, MachineBasicBlock::iterator I, 183 MachineRegisterInfo &MRI) const { 184 MachineOperand dst = MI->getOperand(0); 185 MachineOperand attr_chan = MI->getOperand(1); 186 MachineOperand attr = MI->getOperand(2); 187 MachineOperand params = MI->getOperand(3); 188 unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass); 189 190 BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0) 191 .addOperand(params); 192 193 BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32)) 194 .addOperand(dst) 195 .addOperand(attr_chan) 196 .addOperand(attr) 197 .addReg(M0); 198 199 MI->eraseFromParent(); 200} 201 202void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, 203 MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { 204 unsigned VCC = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 205 206 BuildMI(BB, I, BB.findDebugLoc(I), 207 TII->get(AMDGPU::V_CMP_GT_F32_e32), 208 VCC) 209 .addReg(AMDGPU::SREG_LIT_0) 210 .addOperand(MI->getOperand(1)); 211 212 BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32_e32)) 213 .addOperand(MI->getOperand(0)) 214 .addOperand(MI->getOperand(3)) 215 .addOperand(MI->getOperand(2)) 216 .addReg(VCC); 217 218 MI->eraseFromParent(); 219} 220 221EVT SITargetLowering::getSetCCResultType(EVT VT) const { 222 return MVT::i1; 223} 224 225//===----------------------------------------------------------------------===// 226// Custom DAG Lowering Operations 227//===----------------------------------------------------------------------===// 228 229SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 230 switch (Op.getOpcode()) { 231 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 232 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 233 case ISD::LOAD: return LowerLOAD(Op, DAG); 234 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 235 case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND); 236 case ISD::INTRINSIC_WO_CHAIN: { 237 unsigned IntrinsicID = 238 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 239 EVT VT = Op.getValueType(); 240 switch (IntrinsicID) { 241 case AMDGPUIntrinsic::SI_vs_load_buffer_index: 242 return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, 243 AMDGPU::VGPR0, VT); 244 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 245 } 246 break; 247 } 248 } 249 return SDValue(); 250} 251 252/// \brief The function is for lowering i1 operations on the 253/// VCC register. 254/// 255/// In the VALU context, VCC is a one bit register, but in the 256/// SALU context the VCC is a 64-bit register (1-bit per thread). Since only 257/// the SALU can perform operations on the VCC register, we need to promote 258/// the operand types from i1 to i64 in order for tablegen to be able to match 259/// this operation to the correct SALU instruction. We do this promotion by 260/// wrapping the operands in a CopyToReg node. 261/// 262SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op, 263 SelectionDAG &DAG, 264 unsigned VCCNode) const { 265 DebugLoc DL = Op.getDebugLoc(); 266 267 SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64, 268 DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, 269 Op.getOperand(0)), 270 DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, 271 Op.getOperand(1))); 272 273 return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode); 274} 275 276/// \brief Helper function for LowerBRCOND 277static SDNode *findUser(SDValue Value, unsigned Opcode) { 278 279 SDNode *Parent = Value.getNode(); 280 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 281 I != E; ++I) { 282 283 if (I.getUse().get() != Value) 284 continue; 285 286 if (I->getOpcode() == Opcode) 287 return *I; 288 } 289 return 0; 290} 291 292/// This transforms the control flow intrinsics to get the branch destination as 293/// last parameter, also switches branch target with BR if the need arise 294SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 295 SelectionDAG &DAG) const { 296 297 DebugLoc DL = BRCOND.getDebugLoc(); 298 299 SDNode *Intr = BRCOND.getOperand(1).getNode(); 300 SDValue Target = BRCOND.getOperand(2); 301 SDNode *BR = 0; 302 303 if (Intr->getOpcode() == ISD::SETCC) { 304 // As long as we negate the condition everything is fine 305 SDNode *SetCC = Intr; 306 assert(SetCC->getConstantOperandVal(1) == 1); 307 assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 308 ISD::SETNE); 309 Intr = SetCC->getOperand(0).getNode(); 310 311 } else { 312 // Get the target from BR if we don't negate the condition 313 BR = findUser(BRCOND, ISD::BR); 314 Target = BR->getOperand(1); 315 } 316 317 assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); 318 319 // Build the result and 320 SmallVector<EVT, 4> Res; 321 for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i) 322 Res.push_back(Intr->getValueType(i)); 323 324 // operands of the new intrinsic call 325 SmallVector<SDValue, 4> Ops; 326 Ops.push_back(BRCOND.getOperand(0)); 327 for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i) 328 Ops.push_back(Intr->getOperand(i)); 329 Ops.push_back(Target); 330 331 // build the new intrinsic call 332 SDNode *Result = DAG.getNode( 333 Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 334 DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode(); 335 336 if (BR) { 337 // Give the branch instruction our target 338 SDValue Ops[] = { 339 BR->getOperand(0), 340 BRCOND.getOperand(2) 341 }; 342 DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2); 343 } 344 345 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 346 347 // Copy the intrinsic results to registers 348 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 349 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 350 if (!CopyToReg) 351 continue; 352 353 Chain = DAG.getCopyToReg( 354 Chain, DL, 355 CopyToReg->getOperand(1), 356 SDValue(Result, i - 1), 357 SDValue()); 358 359 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 360 } 361 362 // Remove the old intrinsic from the chain 363 DAG.ReplaceAllUsesOfValueWith( 364 SDValue(Intr, Intr->getNumValues() - 1), 365 Intr->getOperand(0)); 366 367 return Chain; 368} 369 370SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 371 EVT VT = Op.getValueType(); 372 LoadSDNode *Ptr = dyn_cast<LoadSDNode>(Op); 373 374 assert(Ptr); 375 376 unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace(); 377 378 // We only need to lower USER_SGPR address space loads 379 if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) { 380 return SDValue(); 381 } 382 383 // Loads from the USER_SGPR address space can only have constant value 384 // pointers. 385 ConstantSDNode *BasePtr = dyn_cast<ConstantSDNode>(Ptr->getBasePtr()); 386 assert(BasePtr); 387 388 unsigned TypeDwordWidth = VT.getSizeInBits() / 32; 389 const TargetRegisterClass * dstClass; 390 switch (TypeDwordWidth) { 391 default: 392 assert(!"USER_SGPR value size not implemented"); 393 return SDValue(); 394 case 1: 395 dstClass = &AMDGPU::SReg_32RegClass; 396 break; 397 case 2: 398 dstClass = &AMDGPU::SReg_64RegClass; 399 break; 400 } 401 uint64_t Index = BasePtr->getZExtValue(); 402 assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned"); 403 unsigned SGPRIndex = Index / TypeDwordWidth; 404 unsigned Reg = dstClass->getRegister(SGPRIndex); 405 406 DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg, 407 VT)); 408 return SDValue(); 409} 410 411SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 412 SDValue LHS = Op.getOperand(0); 413 SDValue RHS = Op.getOperand(1); 414 SDValue True = Op.getOperand(2); 415 SDValue False = Op.getOperand(3); 416 SDValue CC = Op.getOperand(4); 417 EVT VT = Op.getValueType(); 418 DebugLoc DL = Op.getDebugLoc(); 419 420 // Possible Min/Max pattern 421 SDValue MinMax = LowerMinMax(Op, DAG); 422 if (MinMax.getNode()) { 423 return MinMax; 424 } 425 426 SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); 427 return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); 428} 429 430//===----------------------------------------------------------------------===// 431// Custom DAG optimizations 432//===----------------------------------------------------------------------===// 433 434SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 435 DAGCombinerInfo &DCI) const { 436 SelectionDAG &DAG = DCI.DAG; 437 DebugLoc DL = N->getDebugLoc(); 438 EVT VT = N->getValueType(0); 439 440 switch (N->getOpcode()) { 441 default: break; 442 case ISD::SELECT_CC: { 443 N->dump(); 444 ConstantSDNode *True, *False; 445 // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) 446 if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2))) 447 && (False = dyn_cast<ConstantSDNode>(N->getOperand(3))) 448 && True->isAllOnesValue() 449 && False->isNullValue() 450 && VT == MVT::i1) { 451 return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0), 452 N->getOperand(1), N->getOperand(4)); 453 454 } 455 break; 456 } 457 case ISD::SETCC: { 458 SDValue Arg0 = N->getOperand(0); 459 SDValue Arg1 = N->getOperand(1); 460 SDValue CC = N->getOperand(2); 461 ConstantSDNode * C = NULL; 462 ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); 463 464 // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) 465 if (VT == MVT::i1 466 && Arg0.getOpcode() == ISD::SIGN_EXTEND 467 && Arg0.getOperand(0).getValueType() == MVT::i1 468 && (C = dyn_cast<ConstantSDNode>(Arg1)) 469 && C->isNullValue() 470 && CCOp == ISD::SETNE) { 471 return SimplifySetCC(VT, Arg0.getOperand(0), 472 DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); 473 } 474 break; 475 } 476 } 477 return SDValue(); 478} 479 480#define NODE_NAME_CASE(node) case SIISD::node: return #node; 481 482const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const { 483 switch (Opcode) { 484 default: return AMDGPUTargetLowering::getTargetNodeName(Opcode); 485 NODE_NAME_CASE(VCC_AND) 486 NODE_NAME_CASE(VCC_BITCAST) 487 } 488} 489