SIISelLowering.cpp revision 4e518fd941b119834b5764708fbabf41adc45040
//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"

const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;

using namespace llvm;

SITargetLowering::SITargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM) {

  addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::v2i1, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v4i1, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);

  addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass);

  addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties();

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::ADD, MVT::i64, Legal);
  setOperationAction(ISD::ADD, MVT::i32, Legal);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);

  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setTargetDAGCombine(ISD::SELECT_CC);

  setTargetDAGCombine(ISD::SETCC);

  setSchedulingPreference(Sched::RegPressure);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

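// XXX: *IsFast is never written here, so callers that pass a non-null
// pointer will see whatever value they initialized it to.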
bool
SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                bool *IsFast) const {
  // XXX: This depends on the address space and also we may want to revisit
  // the alignment values we specify in the DataLayout.
  return VT.bitsGT(MVT::i32);
}


SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT,
                                         SDLoc DL, SDValue Chain,
                                         unsigned Offset) const {
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                        AMDGPUAS::CONSTANT_ADDRESS);
  EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
  SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
                          MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                            DAG.getConstant(Offset, MVT::i64));
  return DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
                        MachinePointerInfo(UndefValue::get(PtrTy)),
                        VT, false, false, ArgVT.getSizeInBits() >> 3);
}

SDValue SITargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {

  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  assert(CallConv == CallingConv::C);

  SmallVector<ISD::InputArg, 16> Splits;
  uint32_t Skipped = 0;

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr
    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {

      assert((PSInputNum <= 15) && "Too many PS inputs!");

      if (!Arg.Used) {
        // We can safely skip PS inputs
        Skipped |= 1 << i;
        ++PSInputNum;
        continue;
      }

      Info->PSInputAddr |= 1 << PSInputNum++;
    }

    // Second split vertices into their elements
    if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
      ISD::InputArg NewArg = Arg;
      NewArg.Flags.setSplit();
      NewArg.VT = Arg.VT.getVectorElementType();

      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
      // three or five element vertex only needs three or five registers,
      // NOT four or eight.
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += NewArg.VT.getStoreSize();
      }

    } else {
      Splits.push_back(Arg);
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
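  // (PSInputAddr is a bitmask of enabled interpolants; when none survived the
  // dead-input scan above we force the first mode on and reserve VGPR0/VGPR1,
  // which presumably receive that mode's values, so the argument assignment
  // below does not reuse them.)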
  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
    Info->PSInputAddr |= 1;
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
  }

  // The pointer to the list of arguments is stored in SGPR0, SGPR1
  if (Info->ShaderType == ShaderType::COMPUTE) {
    CCInfo.AllocateReg(AMDGPU::SGPR0);
    CCInfo.AllocateReg(AMDGPU::SGPR1);
    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
  }

  AnalyzeFormalArguments(CCInfo, Splits);

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    const ISD::InputArg &Arg = Ins[i];
    if (Skipped & (1 << i)) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    EVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      // The first 36 bytes of the input buffer contain information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(),
                                   36 + VA.getLocMemOffset());
      InVals.push_back(Arg);
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {

      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);
        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      for (unsigned j = 0; j != NumElements; ++j)
        Regs.push_back(DAG.getUNDEF(VT));

      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
                                   Regs.data(), Regs.size()));
      continue;
    }

    InVals.push_back(Val);
  }
  return Chain;
}

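// Note on SI_ADDR64_RSRC below: the 128-bit resource descriptor is assembled
// as { sub0_sub1 = 64-bit base pointer, sub2_sub3 = { 0, RSRC_DATA_FORMAT } },
// so only the data-format bits of the upper dword pair are populated.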
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {

  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH: return BB;
  case AMDGPU::SI_ADDR64_RSRC: {
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    unsigned SuperReg = MI->getOperand(0).getReg();
    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
            .addOperand(MI->getOperand(1));
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
            .addImm(0);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
            .addImm(RSRC_DATA_FORMAT >> 32);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
            .addReg(SubRegHiLo)
            .addImm(AMDGPU::sub0)
            .addReg(SubRegHiHi)
            .addImm(AMDGPU::sub1);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
            .addReg(SubRegLo)
            .addImm(AMDGPU::sub0_sub1)
            .addReg(SubRegHi)
            .addImm(AMDGPU::sub2_sub3);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::V_SUB_F64: {
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    // a - b is emitted as a + (-b): V_ADD_F64 with the NEG modifier set on
    // the second source.
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
            MI->getOperand(0).getReg())
            .addReg(MI->getOperand(1).getReg())
            .addReg(MI->getOperand(2).getReg())
            .addImm(0)  /* src2 */
            .addImm(0)  /* ABS */
            .addImm(0)  /* CLAMP */
            .addImm(0)  /* OMOD */
            .addImm(2); /* NEG */
    MI->eraseFromParent();
    break;
  }
  }
  return BB;
}

EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
  return MVT::i32;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
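    // The compute dispatch header read by the r600_read_* intrinsics below is
    // laid out as nine consecutive dwords: ngroups.{x,y,z} at byte offsets
    // 0/4/8, global_size.{x,y,z} at 12/16/20 and local_size.{x,y,z} at
    // 24/28/32 -- hence the "36 +" skip when the real kernel arguments are
    // loaded in LowerFormalArguments.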
    // XXX: Hard-coded: we only use two SGPRs to store the pointer to the
    // parameters.
    unsigned NumUserSGPRs = 2;
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case Intrinsic::r600_read_ngroups_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4);
    case Intrinsic::r600_read_ngroups_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8);
    case Intrinsic::r600_read_global_size_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12);
    case Intrinsic::r600_read_global_size_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16);
    case Intrinsic::r600_read_global_size_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20);
    case Intrinsic::r600_read_local_size_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24);
    case Intrinsic::r600_read_local_size_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28);
    case Intrinsic::r600_read_local_size_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32);
    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR0, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR1, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR2, VT);
    }
  }
  }
  return SDValue();
}

/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return 0;
}

/// This transforms the control flow intrinsics to get the branch destination
/// as last parameter, also switches branch target with BR if the need arises
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {

  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = 0;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SDNode *SetCC = Intr;
    assert(SetCC->getConstantOperandVal(1) == 1);
    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
           ISD::SETNE);
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);

  // Build the result and
  SmallVector<EVT, 4> Res;
  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
    Res.push_back(Intr->getValueType(i));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(BRCOND.getOperand(0));
  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
    Ops.push_back(Intr->getOperand(i));
  Ops.push_back(Target);

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(
    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}

SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
}

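/// i64 sign extension is built from 32-bit pieces: an arithmetic shift right
/// by 31 replicates the sign bit of the low word into the high word, and
/// BUILD_PAIR then glues {lo, hi} together into the 64-bit result.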
SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (VT != MVT::i64) {
    return SDValue();
  }

  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
                           DAG.getConstant(31, MVT::i32));

  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  switch (N->getOpcode()) {
    default: break;
    case ISD::SELECT_CC: {
      ConstantSDNode *True, *False;
      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
          && True->isAllOnesValue()
          && False->isNullValue()
          && VT == MVT::i1) {
        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
                           N->getOperand(1), N->getOperand(4));

      }
      break;
    }
    case ISD::SETCC: {
      SDValue Arg0 = N->getOperand(0);
      SDValue Arg1 = N->getOperand(1);
      SDValue CC = N->getOperand(2);
      ConstantSDNode * C = NULL;
      ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();

      // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
      if (VT == MVT::i1
          && Arg0.getOpcode() == ISD::SIGN_EXTEND
          && Arg0.getOperand(0).getValueType() == MVT::i1
          && (C = dyn_cast<ConstantSDNode>(Arg1))
          && C->isNullValue()
          && CCOp == ISD::SETNE) {
        return SimplifySetCC(VT, Arg0.getOperand(0),
                             DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
      }
      break;
    }
  }
  return SDValue();
}

/// \brief Test if RegClass is one of the VSrc classes
static bool isVSrc(unsigned RegClass) {
  return AMDGPU::VSrc_32RegClassID == RegClass ||
         AMDGPU::VSrc_64RegClassID == RegClass;
}

/// \brief Test if RegClass is one of the SSrc classes
static bool isSSrc(unsigned RegClass) {
  return AMDGPU::SSrc_32RegClassID == RegClass ||
         AMDGPU::SSrc_64RegClassID == RegClass;
}

/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {

  union {
    int32_t I;
    float F;
  } Imm;

  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
    if (Node->getZExtValue() >> 32) {
      return -1;
    }
    Imm.I = Node->getSExtValue();
  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
    Imm.F = Node->getValueAPF().convertToFloat();
  else
    return -1; // It isn't an immediate

  if ((Imm.I >= -16 && Imm.I <= 64) ||
      Imm.F == 0.5f || Imm.F == -0.5f ||
      Imm.F == 1.0f || Imm.F == -1.0f ||
      Imm.F == 2.0f || Imm.F == -2.0f ||
      Imm.F == 4.0f || Imm.F == -4.0f)
    return 0; // It's an inline immediate

  return Imm.I; // It's a literal immediate
}

/// \brief Try to fold an immediate directly into an instruction
bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
                               bool &ScalarSlotUsed) const {

  MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
    return false;

  const SDValue &Op = Mov->getOperand(0);
  int32_t Value = analyzeImmediate(Op.getNode());
  if (Value == -1) {
    // Not an immediate at all
    return false;

  } else if (Value == 0) {
    // Inline immediates can always be folded
    Operand = Op;
    return true;

  } else if (Value == Immediate) {
    // Already-folded literal immediate
    Operand = Op;
    return true;

  } else if (!ScalarSlotUsed && !Immediate) {
    // Fold this literal immediate
    ScalarSlotUsed = true;
    Immediate = Value;
    Operand = Op;
    return true;

  }

  return false;
}

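// The single "scalar slot" threaded through foldImm/ensureSRegLimit models a
// hardware restriction: a VALU instruction can read at most one SGPR or one
// literal constant among its sources, so once the slot is taken every further
// scalar operand has to be copied into a VGPR.
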
/// \brief Does "Op" fit into register class "RegClass" ?
bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                    unsigned RegClass) const {

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  SDNode *Node = Op.getNode();

  const TargetRegisterClass *OpClass;
  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
  if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) {
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode());
    int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
    if (OpClassID == -1) {
      switch (MN->getMachineOpcode()) {
      case AMDGPU::REG_SEQUENCE:
        // Operand 0 is the register class id for REG_SEQUENCE instructions.
        OpClass = TRI->getRegClass(
                      cast<ConstantSDNode>(MN->getOperand(0))->getZExtValue());
        break;
      default:
        OpClass = getRegClassFor(Op.getSimpleValueType());
        break;
      }
    } else {
      OpClass = TRI->getRegClass(OpClassID);
    }

  } else if (Node->getOpcode() == ISD::CopyFromReg) {
    RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode());
    OpClass = MRI.getRegClass(Reg->getReg());

  } else
    return false;

  return TRI->getRegClass(RegClass)->hasSubClassEq(OpClass);
}

/// \brief Make sure that we don't exceed the number of allowed scalars
void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
                                       unsigned RegClass,
                                       bool &ScalarSlotUsed) const {

  // First map the operands register class to a destination class
  if (RegClass == AMDGPU::VSrc_32RegClassID)
    RegClass = AMDGPU::VReg_32RegClassID;
  else if (RegClass == AMDGPU::VSrc_64RegClassID)
    RegClass = AMDGPU::VReg_64RegClassID;
  else
    return;

  // Nothing to do if they fit naturally
  if (fitsRegClass(DAG, Operand, RegClass))
    return;

  // If the scalar slot isn't used yet use it now
  if (!ScalarSlotUsed) {
    ScalarSlotUsed = true;
    return;
  }

  // This is a conservative approach, it is possible that we can't determine
  // the correct register class and copy too often, but better safe than sorry.
  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
                                    Operand.getValueType(), Operand, RC);
  Operand = SDValue(Node, 0);
}

/// \returns true if \p Node's operands are different from the SDValue list
/// \p Ops
static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
  for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
    if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
      return true;
    }
  }
  return false;
}

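// foldOperands below may rewrite a VOP instruction from its compact e32
// encoding to the e64 form; the e64 variant carries four extra modifier
// operands (abs/clamp/omod/neg), which is why four zero constants are
// appended when Promote2e64 is set, but it cannot encode a literal constant,
// hence Immediate is reset to -1 on promotion.
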
/// \brief Try to fold the Node's operands into the Node
SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
                                       SelectionDAG &DAG) const {

  // Original encoding (either e32 or e64)
  int Opcode = Node->getMachineOpcode();
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const MCInstrDesc *Desc = &TII->get(Opcode);

  unsigned NumDefs = Desc->getNumDefs();
  unsigned NumOps = Desc->getNumOperands();

  // Commuted opcode if available
  int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
  const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev);

  assert(!DescRev || DescRev->getNumDefs() == NumDefs);
  assert(!DescRev || DescRev->getNumOperands() == NumOps);

  // e64 version if available, -1 otherwise
  int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64);

  assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));

  int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
  bool HaveVSrc = false, HaveSSrc = false;

  // First figure out what we already have in this instruction
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass))
      HaveVSrc = true;
    else if (isSSrc(RegClass))
      HaveSSrc = true;
    else
      continue;

    int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
    if (Imm != -1 && Imm != 0) {
      // Literal immediate
      Immediate = Imm;
    }
  }

  // If we neither have VSrc nor SSrc it makes no sense to continue
  if (!HaveVSrc && !HaveSSrc)
    return Node;

  // No scalar allowed when we have both VSrc and SSrc
  bool ScalarSlotUsed = HaveVSrc && HaveSSrc;

  // Second go over the operands and try to fold them
  std::vector<SDValue> Ops;
  bool Promote2e64 = false;
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    const SDValue &Operand = Node->getOperand(i);
    Ops.push_back(Operand);

    // Already folded immediate ?
    if (isa<ConstantSDNode>(Operand.getNode()) ||
        isa<ConstantFPSDNode>(Operand.getNode()))
      continue;

    // Is this a VSrc or SSrc operand ?
    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass) || isSSrc(RegClass)) {
      // Try to fold the immediates
      if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
        // Folding didn't work, make sure we don't hit the SReg limit
        ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
      }
      continue;
    }

    if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {

      unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
      assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));

      // Test if it makes sense to swap operands
      if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[1], RegClass) &&
           fitsRegClass(DAG, Ops[1], OtherRegClass))) {

        // Swap commutable operands
        SDValue Tmp = Ops[1];
        Ops[1] = Ops[0];
        Ops[0] = Tmp;

        Desc = DescRev;
        DescRev = 0;
        continue;
      }
    }

    if (DescE64 && !Immediate) {

      // Test if it makes sense to switch to e64 encoding
      unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
      if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
        continue;

      int32_t TmpImm = -1;
      if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[i], RegClass) &&
           fitsRegClass(DAG, Ops[i], OtherRegClass))) {

        // Switch to e64 encoding
        Immediate = -1;
        Promote2e64 = true;
        Desc = DescE64;
        DescE64 = 0;
      }
    }
  }

  if (Promote2e64) {
    // Add the modifier flags while promoting
    for (unsigned i = 0; i < 4; ++i)
      Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
  }

  // Add optional chain and glue
  for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
    Ops.push_back(Node->getOperand(i));

  // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
  // this case a brand new node will always be created, even if the operands
  // are the same as before. So, manually check if anything has been changed.
  if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
    return Node;
  }

  // Create a completely new instruction
  return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
}

/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}

/// \brief Adjust the writemask of MIMG instructions
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Writemask = 0, Lane = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    Writemask |= 1 << Lane;
  }

  // Abort if all components are used
  if (Writemask == 0xf)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
    Ops.push_back(Node->getOperand(i));
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());

  // If we only got one lane, replace it with a copy
  if (Writemask == (1U << Lane)) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}

/// \brief Fold the instructions after selecting them
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  Node = AdjustRegClass(Node, DAG);

  if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
    adjustWritemask(Node, DAG);

  return foldOperands(Node, DAG);
}

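// After adjustWritemask shrinks a MIMG result, the instruction writes one
// 32-bit lane per set writemask bit, so the destination register class can
// be narrowed to match (a full 0xf mask keeps the original class via the
// default case below).
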
/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                     SDNode *Node) const {
  if (AMDGPU::isMIMG(MI->getOpcode()) == -1)
    return;

  unsigned VReg = MI->getOperand(0).getReg();
  unsigned Writemask = MI->getOperand(1).getImm();
  unsigned BitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)
    BitsSet += Writemask & (1 << i) ? 1 : 0;

  const TargetRegisterClass *RC;
  switch (BitsSet) {
  default: return;
  case 1: RC = &AMDGPU::VReg_32RegClass; break;
  case 2: RC = &AMDGPU::VReg_64RegClass; break;
  case 3: RC = &AMDGPU::VReg_96RegClass; break;
  }

  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  MRI.setRegClass(VReg, RC);
}

MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
                                                SelectionDAG &DAG) const {

  SDLoc DL(N);
  unsigned NewOpcode = N->getMachineOpcode();

  switch (N->getMachineOpcode()) {
  default: return N;
  case AMDGPU::REG_SEQUENCE: {
    // MVT::i128 only uses SGPRs, so i128 REG_SEQUENCEs don't need to be
    // rewritten.
    if (N->getValueType(0) == MVT::i128) {
      return N;
    }
    const SDValue Ops[] = {
      DAG.getTargetConstant(AMDGPU::VReg_64RegClassID, MVT::i32),
      N->getOperand(1), N->getOperand(2),
      N->getOperand(3), N->getOperand(4)
    };
    return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::i64, Ops);
  }

  case AMDGPU::S_LOAD_DWORD_IMM:
    NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX2_SGPR:
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
    }
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
    }
    if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
      return N;
    }
    ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
    SDValue Ops[] = {
      SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
                                 DAG.getConstant(0, MVT::i64)), 0),
      N->getOperand(0),
      DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
    };
    return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
  }
  }
}

SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}