SIISelLowering.cpp revision a3c2bcf0ee2f63584f7a1e9df9fa153a8b5dfea1
//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"

const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;

using namespace llvm;

SITargetLowering::SITargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM) {

  addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);

  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties();

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::ADD, MVT::i64, Legal);
  setOperationAction(ISD::ADD, MVT::i32, Legal);

  setOperationAction(ISD::BITCAST, MVT::i128, Legal);

  // We need to custom lower vector loads from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);

  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

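  // Note on the Expand entries below: the legalizer replaces an expanded
  // extending load with a plain load plus a separate extend, and an expanded
  // truncating store with a conversion followed by a plain store, since SI
  // has no single-instruction form for either.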
  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  setTargetDAGCombine(ISD::SELECT_CC);

  setTargetDAGCombine(ISD::SETCC);

  setSchedulingPreference(Sched::RegPressure);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                     bool *IsFast) const {
  // XXX: This depends on the address space, and we may also want to revisit
  // the alignment values we specify in the DataLayout.
  return VT.bitsGT(MVT::i32);
}

bool SITargetLowering::shouldSplitVectorElementType(EVT VT) const {
  return VT.bitsLE(MVT::i8);
}

SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT,
                                         SDLoc DL, SDValue Chain,
                                         unsigned Offset) const {
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                        AMDGPUAS::CONSTANT_ADDRESS);
  EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
  SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
                          MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                            DAG.getConstant(Offset, MVT::i64));
  return DAG.getLoad(VT, DL, Chain, Ptr,
                     MachinePointerInfo(UndefValue::get(PtrTy)),
                     false, false, false, ArgVT.getSizeInBits() >> 3);
}

SDValue SITargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {

  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  assert(CallConv == CallingConv::C);

  SmallVector<ISD::InputArg, 16> Splits;
  uint32_t Skipped = 0;

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr
    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {

      assert((PSInputNum <= 15) && "Too many PS inputs!");

      if (!Arg.Used) {
        // We can safely skip PS inputs
        Skipped |= 1 << i;
        ++PSInputNum;
        continue;
      }

      Info->PSInputAddr |= 1 << PSInputNum++;
    }

    // Second, split vertices into their elements
    if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
      ISD::InputArg NewArg = Arg;
      NewArg.Flags.setSplit();
      NewArg.VT = Arg.VT.getVectorElementType();

      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
      // three or five element vertex only needs three or five registers,
      // NOT four or eight.
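      // For example, a <3 x float> argument becomes three f32 InputArgs in
      // the loop below, each advancing PartOffset by getStoreSize() == 4.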
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += NewArg.VT.getStoreSize();
      }

    } else {
      Splits.push_back(Arg);
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
    Info->PSInputAddr |= 1;
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
  }

  // The pointer to the list of arguments is stored in SGPR0, SGPR1
  if (Info->ShaderType == ShaderType::COMPUTE) {
    CCInfo.AllocateReg(AMDGPU::SGPR0);
    CCInfo.AllocateReg(AMDGPU::SGPR1);
    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
  }

  AnalyzeFormalArguments(CCInfo, Splits);

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    const ISD::InputArg &Arg = Ins[i];
    if (Skipped & (1 << i)) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    EVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      // The first 36 bytes of the input buffer contain information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(),
                                   36 + VA.getLocMemOffset());
      InVals.push_back(Arg);
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {

      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);
        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      for (unsigned j = 0; j != NumElements; ++j)
        Regs.push_back(DAG.getUNDEF(VT));

      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
                                   Regs.data(), Regs.size()));
      continue;
    }

    InVals.push_back(Val);
  }
  return Chain;
}

MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {

  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH: return BB;
  case AMDGPU::SI_ADDR64_RSRC: {
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
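    // The code below assembles a 128-bit buffer resource descriptor: the low
    // 64 bits take the base pointer from operand 1 via S_MOV_B64, the high
    // 64 bits are built as {0, RSRC_DATA_FORMAT >> 32}, and the two halves
    // are then glued together with REG_SEQUENCE.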
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    unsigned SuperReg = MI->getOperand(0).getReg();
    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
            .addOperand(MI->getOperand(1));
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
            .addImm(0);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
            .addImm(RSRC_DATA_FORMAT >> 32);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
            .addReg(SubRegHiLo)
            .addImm(AMDGPU::sub0)
            .addReg(SubRegHiHi)
            .addImm(AMDGPU::sub1);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
            .addReg(SubRegLo)
            .addImm(AMDGPU::sub0_sub1)
            .addReg(SubRegHi)
            .addImm(AMDGPU::sub2_sub3);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::V_SUB_F64: {
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
            MI->getOperand(0).getReg())
            .addReg(MI->getOperand(1).getReg())
            .addReg(MI->getOperand(2).getReg())
            .addImm(0)  /* src2 */
            .addImm(0)  /* ABS */
            .addImm(0)  /* CLAMP */
            .addImm(0)  /* OMOD */
            .addImm(2); /* NEG */
    MI->eraseFromParent();
    break;
  }
  }
  return BB;
}

EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
  return MVT::i32;
}

bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
    return false; /* There is V_MAD_F32 for f32 */
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
    if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        Op.getValueType().isVector()) {
      SDValue MergedValues[2] = {
        SplitVectorLoad(Op, DAG),
        Load->getChain()
      };
      return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
    } else {
      return SDValue();
    }
  }
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: {
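    // The implicit dispatch info skipped over by the 36-byte bias in
    // LowerFormalArguments is read back here: ngroups.{x,y,z} live at byte
    // offsets 0/4/8, global_size.{x,y,z} at 12/16/20, and local_size.{x,y,z}
    // at 24/28/32.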
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    // XXX: Hardcoded for now; we only use two user SGPRs, which hold the
    // pointer to the parameters.
    unsigned NumUserSGPRs = 2;
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case Intrinsic::r600_read_ngroups_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4);
    case Intrinsic::r600_read_ngroups_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8);
    case Intrinsic::r600_read_global_size_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12);
    case Intrinsic::r600_read_global_size_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16);
    case Intrinsic::r600_read_global_size_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20);
    case Intrinsic::r600_read_local_size_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24);
    case Intrinsic::r600_read_local_size_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28);
    case Intrinsic::r600_read_local_size_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32);
    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR0, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR1, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR2, VT);
    case AMDGPUIntrinsic::SI_load_const: {
      SDValue Ops [] = {
        ResourceDescriptorToi128(Op.getOperand(1), DAG),
        Op.getOperand(2)
      };

      MachineMemOperand *MMO = MF.getMachineMemOperand(
          MachinePointerInfo(),
          MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
          VT.getSizeInBits() / 8, 4);
      return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                     Op->getVTList(), Ops, 2, VT, MMO);
    }
    case AMDGPUIntrinsic::SI_sample:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
    case AMDGPUIntrinsic::SI_sampleb:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
    case AMDGPUIntrinsic::SI_sampled:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
    case AMDGPUIntrinsic::SI_samplel:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
    case AMDGPUIntrinsic::SI_vs_load_input:
      return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
                         ResourceDescriptorToi128(Op.getOperand(1), DAG),
                         Op.getOperand(2),
                         Op.getOperand(3));
    }
  }

  case ISD::INTRINSIC_VOID:
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

    switch (IntrinsicID) {
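    // SI_tbuffer_store passes its whole addressing state through explicit
    // operands: the resource descriptor (operand 2), the value to store
    // (operand 3), and what appear to be index/offset/format fields in
    // operands 4-14; they are simply repackaged into a memory intrinsic node.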
    case AMDGPUIntrinsic::SI_tbuffer_store: {
      SDLoc DL(Op);
      SDValue Ops [] = {
        Chain,
        ResourceDescriptorToi128(Op.getOperand(2), DAG),
        Op.getOperand(3),
        Op.getOperand(4),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10),
        Op.getOperand(11),
        Op.getOperand(12),
        Op.getOperand(13),
        Op.getOperand(14)
      };
      EVT VT = Op.getOperand(3).getValueType();

      MachineMemOperand *MMO = MF.getMachineMemOperand(
          MachinePointerInfo(),
          MachineMemOperand::MOStore,
          VT.getSizeInBits() / 8, 4);
      return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
                                     Op->getVTList(), Ops,
                                     sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
    }
    default:
      break;
    }
  }
  return SDValue();
}

/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return 0;
}

/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter; it also switches the branch target with BR if the
/// need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {

  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = 0;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SDNode *SetCC = Intr;
    assert(SetCC->getConstantOperandVal(1) == 1);
    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
           ISD::SETNE);
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);

  // Build the result types of the new intrinsic call
  SmallVector<EVT, 4> Res;
  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
    Res.push_back(Intr->getValueType(i));

  // Build the operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(BRCOND.getOperand(0));
  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
    Ops.push_back(Intr->getOperand(i));
  Ops.push_back(Target);
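  // Res collects every result type of the intrinsic except the first; the
  // rebuilt node below becomes INTRINSIC_W_CHAIN when more than one type
  // remains (data results plus the chain) and INTRINSIC_VOID otherwise.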
  // Build the new intrinsic call
  SDNode *Result = DAG.getNode(
    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}

SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
                                                   SelectionDAG &DAG) const {

  if (Op.getValueType() == MVT::i128) {
    return Op;
  }

  assert(Op.getOpcode() == ISD::UNDEF);

  return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128,
                     DAG.getConstant(0, MVT::i64),
                     DAG.getConstant(0, MVT::i64));
}

SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
                                               const SDValue &Op,
                                               SelectionDAG &DAG) const {
  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
                     Op.getOperand(2),
                     ResourceDescriptorToi128(Op.getOperand(3), DAG),
                     Op.getOperand(4));
}

SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
}

SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (VT != MVT::i64) {
    return SDValue();
  }

  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
                           DAG.getConstant(31, MVT::i32));

  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
}

SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (VT != MVT::i64) {
    return SDValue();
  }

  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0),
                     DAG.getConstant(0, MVT::i32));
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  switch (N->getOpcode()) {
    default: break;
    case ISD::SELECT_CC: {
      ConstantSDNode *True, *False;
      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
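      // e.g. (select_cc a, b, -1, 0, setlt) is equivalent to
      // (setcc a, b, setlt) once the result type is i1, because true is
      // already represented as all-ones there.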
      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
          && True->isAllOnesValue()
          && False->isNullValue()
          && VT == MVT::i1) {
        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
                           N->getOperand(1), N->getOperand(4));

      }
      break;
    }
    case ISD::SETCC: {
      SDValue Arg0 = N->getOperand(0);
      SDValue Arg1 = N->getOperand(1);
      SDValue CC = N->getOperand(2);
      ConstantSDNode * C = NULL;
      ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();

      // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
      if (VT == MVT::i1
          && Arg0.getOpcode() == ISD::SIGN_EXTEND
          && Arg0.getOperand(0).getValueType() == MVT::i1
          && (C = dyn_cast<ConstantSDNode>(Arg1))
          && C->isNullValue()
          && CCOp == ISD::SETNE) {
        return SimplifySetCC(VT, Arg0.getOperand(0),
                             DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
      }
      break;
    }
  }
  return SDValue();
}

/// \brief Test if RegClass is one of the VSrc classes
static bool isVSrc(unsigned RegClass) {
  return AMDGPU::VSrc_32RegClassID == RegClass ||
         AMDGPU::VSrc_64RegClassID == RegClass;
}

/// \brief Test if RegClass is one of the SSrc classes
static bool isSSrc(unsigned RegClass) {
  return AMDGPU::SSrc_32RegClassID == RegClass ||
         AMDGPU::SSrc_64RegClassID == RegClass;
}

/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {

  union {
    int32_t I;
    float F;
  } Imm;

  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
    if (Node->getZExtValue() >> 32) {
      return -1;
    }
    Imm.I = Node->getSExtValue();
  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
    Imm.F = Node->getValueAPF().convertToFloat();
  else
    return -1; // It isn't an immediate

  if ((Imm.I >= -16 && Imm.I <= 64) ||
      Imm.F == 0.5f || Imm.F == -0.5f ||
      Imm.F == 1.0f || Imm.F == -1.0f ||
      Imm.F == 2.0f || Imm.F == -2.0f ||
      Imm.F == 4.0f || Imm.F == -4.0f)
    return 0; // It's an inline immediate

  return Imm.I; // It's a literal immediate
}

/// \brief Try to fold an immediate directly into an instruction
bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
                               bool &ScalarSlotUsed) const {

  MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
    return false;

  const SDValue &Op = Mov->getOperand(0);
  int32_t Value = analyzeImmediate(Op.getNode());
  if (Value == -1) {
    // Not an immediate at all
    return false;

  } else if (Value == 0) {
    // Inline immediates can always be folded
    Operand = Op;
    return true;

  } else if (Value == Immediate) {
    // This literal immediate is already folded; reuse it
    Operand = Op;
    return true;

  } else if (!ScalarSlotUsed && !Immediate) {
    // Fold this literal immediate
    ScalarSlotUsed = true;
    Immediate = Value;
    Operand = Op;
    return true;

  }

  return false;
}
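// To illustrate the classification above: 1.0f or any integer in [-16, 64]
// is an inline immediate, encoded directly in the source operand field,
// while a value such as 100 or 1.5f is a 32-bit literal that has to occupy
// the instruction's single literal slot.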
const TargetRegisterClass *SITargetLowering::getRegClassForNode(
                                  SelectionDAG &DAG, const SDValue &Op) const {
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  if (!Op->isMachineOpcode()) {
    switch(Op->getOpcode()) {
    case ISD::CopyFromReg: {
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        return MRI.getRegClass(Reg);
      }
      return TRI.getPhysRegClass(Reg);
    }
    default: return NULL;
    }
  }
  const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
  int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
  if (OpClassID != -1) {
    return TRI.getRegClass(OpClassID);
  }
  switch(Op.getMachineOpcode()) {
  case AMDGPU::COPY_TO_REGCLASS:
    // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
    OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();

    // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
    // class, then the register class for the value could be either a
    // VReg or an SReg. In order to get a more accurate answer, recurse
    // on the value actually being copied.
    if (OpClassID == AMDGPU::VSrc_32RegClassID ||
        OpClassID == AMDGPU::VSrc_64RegClassID) {
      return getRegClassForNode(DAG, Op.getOperand(0));
    }
    return TRI.getRegClass(OpClassID);
  case AMDGPU::EXTRACT_SUBREG: {
    int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    const TargetRegisterClass *SuperClass =
      getRegClassForNode(DAG, Op.getOperand(0));
    return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
  }
  case AMDGPU::REG_SEQUENCE:
    // Operand 0 is the register class id for REG_SEQUENCE instructions.
    return TRI.getRegClass(
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
  default:
    return getRegClassFor(Op.getSimpleValueType());
  }
}

/// \brief Does "Op" fit into register class "RegClass" ?
bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                    unsigned RegClass) const {
  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
  const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
  if (!RC) {
    return false;
  }
  return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
}
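// The VSrc classes handled below are unions: a VSrc operand may name either
// a VGPR or a scalar value, but an instruction can use at most one scalar
// source, which is what the ScalarSlotUsed flag tracks.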
/// \brief Make sure that we don't exceed the number of allowed scalars
void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
                                       unsigned RegClass,
                                       bool &ScalarSlotUsed) const {

  // First map the operand's register class to a destination class
  if (RegClass == AMDGPU::VSrc_32RegClassID)
    RegClass = AMDGPU::VReg_32RegClassID;
  else if (RegClass == AMDGPU::VSrc_64RegClassID)
    RegClass = AMDGPU::VReg_64RegClassID;
  else
    return;

  // Nothing to do if they fit naturally
  if (fitsRegClass(DAG, Operand, RegClass))
    return;

  // If the scalar slot isn't used yet, use it now
  if (!ScalarSlotUsed) {
    ScalarSlotUsed = true;
    return;
  }

  // This is a conservative approach; it is possible that we can't determine
  // the correct register class and copy too often, but better safe than sorry.
  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
                                    Operand.getValueType(), Operand, RC);
  Operand = SDValue(Node, 0);
}

/// \returns true if \p Node's operands are different from the SDValue list
/// \p Ops
static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
  for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
    if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
      return true;
    }
  }
  return false;
}

/// \brief Try to fold the Node's operands into the Node
SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
                                       SelectionDAG &DAG) const {

  // Original encoding (either e32 or e64)
  int Opcode = Node->getMachineOpcode();
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const MCInstrDesc *Desc = &TII->get(Opcode);

  unsigned NumDefs = Desc->getNumDefs();
  unsigned NumOps = Desc->getNumOperands();

  // Commuted opcode if available
  int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
  const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev);

  assert(!DescRev || DescRev->getNumDefs() == NumDefs);
  assert(!DescRev || DescRev->getNumOperands() == NumOps);

  // e64 version if available, -1 otherwise
  int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64);

  assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));

  int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
  bool HaveVSrc = false, HaveSSrc = false;

  // First figure out what we already have in this instruction
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass))
      HaveVSrc = true;
    else if (isSSrc(RegClass))
      HaveSSrc = true;
    else
      continue;

    int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
    if (Imm != -1 && Imm != 0) {
      // Literal immediate
      Immediate = Imm;
    }
  }

  // If we have neither VSrc nor SSrc, it makes no sense to continue
  if (!HaveVSrc && !HaveSSrc)
    return Node;

  // No scalar allowed when we have both VSrc and SSrc
  bool ScalarSlotUsed = HaveVSrc && HaveSSrc;

  // Second, go over the operands and try to fold them
  std::vector<SDValue> Ops;
  bool Promote2e64 = false;
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    const SDValue &Operand = Node->getOperand(i);
    Ops.push_back(Operand);

    // Already folded immediate?
    if (isa<ConstantSDNode>(Operand.getNode()) ||
        isa<ConstantFPSDNode>(Operand.getNode()))
      continue;
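    // Three strategies are tried in turn below: fold the value straight into
    // a VSrc/SSrc slot, commute the operands when the reversed opcode helps,
    // or promote the whole instruction to its e64 encoding.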
    // Is this a VSrc or SSrc operand?
    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass) || isSSrc(RegClass)) {
      // Try to fold the immediates
      if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
        // Folding didn't work; make sure we don't hit the SReg limit
        ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
      }
      continue;
    }

    if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {

      unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
      assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));

      // Test if it makes sense to swap operands
      if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[1], RegClass) &&
           fitsRegClass(DAG, Ops[1], OtherRegClass))) {

        // Swap commutable operands
        SDValue Tmp = Ops[1];
        Ops[1] = Ops[0];
        Ops[0] = Tmp;

        Desc = DescRev;
        DescRev = 0;
        continue;
      }
    }

    if (DescE64 && !Immediate) {

      // Test if it makes sense to switch to e64 encoding
      unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
      if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
        continue;

      int32_t TmpImm = -1;
      if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[i], RegClass) &&
           fitsRegClass(DAG, Ops[1], OtherRegClass))) {

        // Switch to e64 encoding
        Immediate = -1;
        Promote2e64 = true;
        Desc = DescE64;
        DescE64 = 0;
      }
    }
  }

  if (Promote2e64) {
    // Add the modifier flags while promoting
    for (unsigned i = 0; i < 4; ++i)
      Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
  }
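  // (The four zeros appended above fill the extra operand slots of the e64
  // form, matching the NumOps + 4 operand count asserted for DescE64;
  // presumably these are the modifier fields, cf. the ABS/CLAMP/OMOD/NEG
  // immediates in the V_SUB_F64 expansion earlier in this file.)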
  // Add optional chain and glue
  for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
    Ops.push_back(Node->getOperand(i));

  // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
  // this case a brand new node is always created, even if the operands
  // are the same as before. So, manually check if anything has been changed.
  if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
    return Node;
  }

  // Create a completely new instruction
  return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
}

/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}

/// \brief Adjust the writemask of MIMG instructions
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Writemask = 0, Lane = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    Writemask |= 1 << Lane;
  }

  // Abort if all components are used
  if (Writemask == 0xf)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
    Ops.push_back(Node->getOperand(i));
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());

  // If we only got one lane, replace it with a copy
  if (Writemask == (1U << Lane)) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}

/// \brief Fold the instructions after selecting them
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  Node = AdjustRegClass(Node, DAG);

  if (TII->isMIMG(Node->getMachineOpcode()))
    adjustWritemask(Node, DAG);

  return foldOperands(Node, DAG);
}
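// For instance, an image sample whose result only feeds an EXTRACT_SUBREG of
// sub0 gets writemask 0x1 from adjustWritemask, and the hook below then
// shrinks its destination register class to a single VReg_32.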
/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  if (!TII->isMIMG(MI->getOpcode()))
    return;

  unsigned VReg = MI->getOperand(0).getReg();
  unsigned Writemask = MI->getOperand(1).getImm();
  unsigned BitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)
    BitsSet += Writemask & (1 << i) ? 1 : 0;

  const TargetRegisterClass *RC;
  switch (BitsSet) {
  default: return;
  case 1: RC = &AMDGPU::VReg_32RegClass; break;
  case 2: RC = &AMDGPU::VReg_64RegClass; break;
  case 3: RC = &AMDGPU::VReg_96RegClass; break;
  }

  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  MRI.setRegClass(VReg, RC);
}

MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
                                                SelectionDAG &DAG) const {

  SDLoc DL(N);
  unsigned NewOpcode = N->getMachineOpcode();

  switch (N->getMachineOpcode()) {
  default: return N;
  case AMDGPU::S_LOAD_DWORD_IMM:
    NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX2_SGPR:
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
    }
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
    }
    if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
      return N;
    }
    ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
    SDValue Ops[] = {
      SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
                                 DAG.getConstant(0, MVT::i64)), 0),
      N->getOperand(0),
      DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
    };
    return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
  }
  }
}

SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}