SIISelLowering.cpp revision 36b56886974eae4f9c5ebc96befd3e7bfe5de338
//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"

using namespace llvm;

SITargetLowering::SITargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM) {
  addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);

  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties();

  // Condition Codes
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);

  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::ADD, MVT::i32, Legal);
  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);

  setOperationAction(ISD::BITCAST, MVT::i128, Legal);

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);

  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
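
  // A sketch of the split strategy (see LowerOperation and LowerSTORE
  // below): vector loads from the local and private address spaces, and
  // global vector loads with more than 4 elements, go through
  // SplitVectorLoad, while vector stores of 8 or more elements go through
  // SplitVectorStore.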

  // We need to custom lower loads/stores from private memory
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i64, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);

  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::i64, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);

  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  MVT VecTypes[] = {
    MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
  };

  const size_t NumVecTypes = array_lengthof(VecTypes);
  for (unsigned Type = 0; Type < NumVecTypes; ++Type) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch(Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::CONCAT_VECTORS:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
        break;
      default:
        setOperationAction(Op, VecTypes[Type], Expand);
        break;
      }
    }
  }

  for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) {
    MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
  }

  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
  }

  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::SETCC);

  setSchedulingPreference(Sched::RegPressure);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                     unsigned AddrSpace,
                                                     bool *IsFast) const {
  // XXX: This depends on the address space, and we may also want to revisit
  // the alignment values we specify in the DataLayout.
  if (!VT.isSimple() || VT == MVT::Other)
    return false;
  return VT.bitsGT(MVT::i32);
}

bool SITargetLowering::shouldSplitVectorType(EVT VT) const {
  return VT.getScalarType().bitsLE(MVT::i16);
}

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  return TII->isInlineConstant(Imm);
}

SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         SDLoc DL, SDValue Chain,
                                         unsigned Offset) const {
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                        AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
                          MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                            DAG.getConstant(Offset, MVT::i64));
  return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
                        false, false, MemVT.getSizeInBits() >> 3);

}

SDValue SITargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {

  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  assert(CallConv == CallingConv::C);

  SmallVector<ISD::InputArg, 16> Splits;
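  // Bit i of Skipped is set when formal argument i is an unused PS input;
  // the second loop below hands back UNDEF for those arguments.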
  uint32_t Skipped = 0;

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr
    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
        !Arg.Flags.isByVal()) {

      assert((PSInputNum <= 15) && "Too many PS inputs!");

      if (!Arg.Used) {
        // We can safely skip PS inputs
        Skipped |= 1 << i;
        ++PSInputNum;
        continue;
      }

      Info->PSInputAddr |= 1 << PSInputNum++;
    }

    // Second, split vertices into their elements
    if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
      ISD::InputArg NewArg = Arg;
      NewArg.Flags.setSplit();
      NewArg.VT = Arg.VT.getVectorElementType();

      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
      // three or five element vertex only needs three or five registers,
      // NOT four or eighth.
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += NewArg.VT.getStoreSize();
      }

    } else if (Info->ShaderType != ShaderType::COMPUTE) {
      Splits.push_back(Arg);
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
    Info->PSInputAddr |= 1;
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
  }

  // The pointer to the list of arguments is stored in SGPR0, SGPR1
  if (Info->ShaderType == ShaderType::COMPUTE) {
    CCInfo.AllocateReg(AMDGPU::SGPR0);
    CCInfo.AllocateReg(AMDGPU::SGPR1);
    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
  }

  if (Info->ShaderType == ShaderType::COMPUTE) {
    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                            Splits);
  }

  AnalyzeFormalArguments(CCInfo, Splits);

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    const ISD::InputArg &Arg = Ins[i];
    if (Skipped & (1 << i)) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    EVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = Splits[i].VT;
      // The first 36 bytes of the input buffer contains information about
      // thread group and global sizes.
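      // The layout of that header can be read off the intrinsic offsets in
      // LowerOperation below: ngroups.{x,y,z} at bytes 0/4/8,
      // global_size.{x,y,z} at 12/16/20 and local_size.{x,y,z} at 24/28/32,
      // with the user arguments starting at byte 36.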
      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
                                   36 + VA.getLocMemOffset());
      InVals.push_back(Arg);
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {

      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);
        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      for (unsigned j = 0; j != NumElements; ++j)
        Regs.push_back(DAG.getUNDEF(VT));

      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
                                   Regs.data(), Regs.size()));
      continue;
    }

    InVals.push_back(Val);
  }
  return Chain;
}

MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {

  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH: return BB;
  case AMDGPU::SI_ADDR64_RSRC: {
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    unsigned SuperReg = MI->getOperand(0).getReg();
    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
            .addOperand(MI->getOperand(1));
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
            .addImm(0);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
            .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
            .addReg(SubRegHiLo)
            .addImm(AMDGPU::sub0)
            .addReg(SubRegHiHi)
            .addImm(AMDGPU::sub1);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
            .addReg(SubRegLo)
            .addImm(AMDGPU::sub0_sub1)
            .addReg(SubRegHi)
            .addImm(AMDGPU::sub2_sub3);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::V_SUB_F64: {
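    // There is no V_SUB_F64 in hardware, so a - b is emitted as V_ADD_F64
    // with the NEG source modifier set on src1, i.e. a + (-b); the NEG
    // immediate below is 2 because bit 1 of the mask selects src1.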
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
            MI->getOperand(0).getReg())
            .addReg(MI->getOperand(1).getReg())
            .addReg(MI->getOperand(2).getReg())
            .addImm(0)  /* src2 */
            .addImm(0)  /* ABS */
            .addImm(0)  /* CLAMP */
            .addImm(0)  /* OMOD */
            .addImm(2); /* NEG */
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SI_RegisterStorePseudo: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstrBuilder MIB =
        BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
                Reg);
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
      MIB.addOperand(MI->getOperand(i));

    MI->eraseFromParent();
  }
  }
  return BB;
}

EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
  return MVT::i32;
}

bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
    return false; /* There is V_MAD_F32 for f32 */
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
    if (Op.getValueType().isVector() &&
        (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
         Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
         (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
          Op.getValueType().getVectorNumElements() > 4))) {
      SDValue MergedValues[2] = {
        SplitVectorLoad(Op, DAG),
        Load->getChain()
      };
      return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
    } else {
      return LowerLOAD(Op, DAG);
    }
  }

  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::ANY_EXTEND: // Fall-through
  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    // XXX: Hardcoded for now; we only use two user SGPRs, which hold the
    // pointer to the parameters.
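    // SGPR0 and SGPR1 are the pair that LowerFormalArguments marked live-in
    // for compute shaders above, so the first free user SGPR is SGPR2 and
    // the tgid reads below start there.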
    unsigned NumUserSGPRs = 2;
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case Intrinsic::r600_read_ngroups_x:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4);
    case Intrinsic::r600_read_ngroups_z:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8);
    case Intrinsic::r600_read_global_size_x:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12);
    case Intrinsic::r600_read_global_size_y:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16);
    case Intrinsic::r600_read_global_size_z:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20);
    case Intrinsic::r600_read_local_size_x:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24);
    case Intrinsic::r600_read_local_size_y:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28);
    case Intrinsic::r600_read_local_size_z:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32);
    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR0, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR1, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR2, VT);
    case AMDGPUIntrinsic::SI_load_const: {
      SDValue Ops [] = {
        ResourceDescriptorToi128(Op.getOperand(1), DAG),
        Op.getOperand(2)
      };

      MachineMemOperand *MMO = MF.getMachineMemOperand(
          MachinePointerInfo(),
          MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
          VT.getSizeInBits() / 8, 4);
      return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                     Op->getVTList(), Ops, 2, VT, MMO);
    }
    case AMDGPUIntrinsic::SI_sample:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
    case AMDGPUIntrinsic::SI_sampleb:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
    case AMDGPUIntrinsic::SI_sampled:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
    case AMDGPUIntrinsic::SI_samplel:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
    case AMDGPUIntrinsic::SI_vs_load_input:
      return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
                         ResourceDescriptorToi128(Op.getOperand(1), DAG),
                         Op.getOperand(2),
                         Op.getOperand(3));
    }
  }

  case ISD::INTRINSIC_VOID:
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

    switch (IntrinsicID) {
    case AMDGPUIntrinsic::SI_tbuffer_store: {
      SDLoc DL(Op);
      SDValue Ops [] = {
        Chain,
        ResourceDescriptorToi128(Op.getOperand(2), DAG),
        Op.getOperand(3),
        Op.getOperand(4),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10),
        Op.getOperand(11),
        Op.getOperand(12),
        Op.getOperand(13),
        Op.getOperand(14)
      };
      EVT VT = Op.getOperand(3).getValueType();

      MachineMemOperand *MMO = MF.getMachineMemOperand(
          MachinePointerInfo(),
          MachineMemOperand::MOStore,
          VT.getSizeInBits() / 8, 4);
      return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
                                     Op->getVTList(), Ops,
                                     sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
    }
    default:
      break;
    }
  }
  return SDValue();
}

/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return 0;
}

/// This transforms the control flow intrinsics to get the branch destination
/// as their last parameter, and also switches the branch target with BR if
/// the need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {

  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = 0;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SDNode *SetCC = Intr;
    assert(SetCC->getConstantOperandVal(1) == 1);
    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
           ISD::SETNE);
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);

  // Build the result and
  SmallVector<EVT, 4> Res;
  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
    Res.push_back(Intr->getValueType(i));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(BRCOND.getOperand(0));
  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
    Ops.push_back(Intr->getOperand(i));
  Ops.push_back(Target);

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(
    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}

SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
  SDValue MergedValues[2];
  MergedValues[1] = Load->getChain();
  if (Ret.getNode()) {
    MergedValues[0] = Ret;
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  EVT MemVT = Load->getMemoryVT();

  assert(!MemVT.isVector() && "Private loads should be scalarized");
  assert(!MemVT.isFloatingPoint() && "FP loads should be promoted to int");

  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
                            DAG.getConstant(2, MVT::i32));
  Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                    Load->getChain(), Ptr,
                    DAG.getTargetConstant(0, MVT::i32),
                    Op.getOperand(2));
  if (MemVT.getSizeInBits() == 64) {
    SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                                 DAG.getConstant(1, MVT::i32));

    SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                                    Load->getChain(), IncPtr,
                                    DAG.getTargetConstant(0, MVT::i32),
                                    Op.getOperand(2));

    Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper);
  }

  MergedValues[0] = Ret;
  return DAG.getMergeValues(MergedValues, 2, DL);

}

SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
                                                   SelectionDAG &DAG) const {

  if (Op.getValueType() == MVT::i128) {
    return Op;
  }

  assert(Op.getOpcode() == ISD::UNDEF);

  return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128,
                     DAG.getConstant(0, MVT::i64),
                     DAG.getConstant(0, MVT::i64));
}

SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
                                               const SDValue &Op,
                                               SelectionDAG &DAG) const {
  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
                     Op.getOperand(2),
                     ResourceDescriptorToi128(Op.getOperand(3), DAG),
                     Op.getOperand(4));
}
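
// LowerSELECT splits an i64 select into two i32 selects: both inputs are
// bitcast to v2i32, the low and high halves are selected separately on the
// same condition, and the pieces are rebuilt into an i64.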
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i64)
    return SDValue();

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue Zero = DAG.getConstant(0, MVT::i32);
  SDValue One = DAG.getConstant(1, MVT::i32);

  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);

  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);

  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);

  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);

  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
}

SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
}

SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (VT != MVT::i64) {
    return SDValue();
  }

  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
                           DAG.getConstant(31, MVT::i32));

  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
}

SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode())
    return Ret;

  if (VT.isVector() && VT.getVectorNumElements() >= 8)
    return SplitVectorStore(Op, DAG);

  if (VT == MVT::i1)
    return DAG.getTruncStore(Store->getChain(), DL,
                        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
                        Store->getBasePtr(), MVT::i1, Store->getMemOperand());

  if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
    return SDValue();

  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Store->getBasePtr(),
                            DAG.getConstant(2, MVT::i32));
  SDValue Chain = Store->getChain();
  SmallVector<SDValue, 8> Values;

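  // A worked example of the read-modify-write below, assuming an i8 store
  // to byte offset 2 of a private dword: Mask = 0xff and ShiftAmt = 16, so
  // the value is masked and shifted up to bits 16..23, DstMask becomes
  // rotr(~0xff, 32 - 16) = 0xff00ffff, the old dword is ANDed with that to
  // clear the destination byte, and the shifted value is ORed in.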
  if (Store->isTruncatingStore()) {
    unsigned Mask = 0;
    if (Store->getMemoryVT() == MVT::i8) {
      Mask = 0xff;
    } else if (Store->getMemoryVT() == MVT::i16) {
      Mask = 0xffff;
    }
    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                              Chain, Store->getBasePtr(),
                              DAG.getConstant(0, MVT::i32));
    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(),
                                  DAG.getConstant(0x3, MVT::i32));
    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                   DAG.getConstant(3, MVT::i32));
    SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getValue(),
                                      DAG.getConstant(Mask, MVT::i32));
    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                       MaskedValue, ShiftAmt);
    SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32,
                                  DAG.getConstant(32, MVT::i32), ShiftAmt);
    SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32,
                                  DAG.getConstant(~Mask, MVT::i32),
                                  RotrAmt);
    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
    Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);

    Values.push_back(Dst);
  } else if (VT == MVT::i64) {
    for (unsigned i = 0; i < 2; ++i) {
      Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
                       Store->getValue(), DAG.getConstant(i, MVT::i32)));
    }
  } else if (VT == MVT::i128) {
    for (unsigned i = 0; i < 2; ++i) {
      for (unsigned j = 0; j < 2; ++j) {
        Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
                           DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
                           Store->getValue(), DAG.getConstant(i, MVT::i32)),
                           DAG.getConstant(j, MVT::i32)));
      }
    }
  } else {
    Values.push_back(Store->getValue());
  }

  for (unsigned i = 0; i < Values.size(); ++i) {
    SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
                                  Ptr, DAG.getConstant(i, MVT::i32));
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                        Chain, Values[i], PartPtr,
                        DAG.getTargetConstant(0, MVT::i32));
  }
  return Chain;
}


SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (VT != MVT::i64) {
    return SDValue();
  }

  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0),
                     DAG.getConstant(0, MVT::i32));
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  switch (N->getOpcode()) {
    default: break;
    case ISD::SELECT_CC: {
      ConstantSDNode *True, *False;
      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
          && True->isAllOnesValue()
          && False->isNullValue()
          && VT == MVT::i1) {
        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
                           N->getOperand(1), N->getOperand(4));

      }
      break;
    }
    case ISD::SETCC: {
      SDValue Arg0 = N->getOperand(0);
      SDValue Arg1 = N->getOperand(1);
      SDValue CC = N->getOperand(2);
      ConstantSDNode * C = NULL;
      ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();

      // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
      if (VT == MVT::i1
          && Arg0.getOpcode() == ISD::SIGN_EXTEND
          && Arg0.getOperand(0).getValueType() == MVT::i1
          && (C = dyn_cast<ConstantSDNode>(Arg1))
          && C->isNullValue()
          && CCOp == ISD::SETNE) {
        return SimplifySetCC(VT, Arg0.getOperand(0),
                             DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
      }
      break;
    }
  }
  return SDValue();
}

/// \brief Test if RegClass is one of the VSrc classes
static bool isVSrc(unsigned RegClass) {
  return AMDGPU::VSrc_32RegClassID == RegClass ||
         AMDGPU::VSrc_64RegClassID == RegClass;
}

/// \brief Test if RegClass is one of the SSrc classes
static bool isSSrc(unsigned RegClass) {
  return AMDGPU::SSrc_32RegClassID == RegClass ||
         AMDGPU::SSrc_64RegClassID == RegClass;
}
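
// A note on SI immediates, summarizing the checks in analyzeImmediate and
// foldOperands below: the integers -16..64 and the floats +/-0.5, +/-1.0,
// +/-2.0 and +/-4.0 are "inline" immediates that encode directly into an
// instruction, while any other 32-bit value must occupy the single extra
// literal dword an encoding can carry, which is why only one distinct
// literal may be folded per node.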

/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {

  union {
    int32_t I;
    float F;
  } Imm;

  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
    if (Node->getZExtValue() >> 32) {
      return -1;
    }
    Imm.I = Node->getSExtValue();
  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
    Imm.F = Node->getValueAPF().convertToFloat();
  else
    return -1; // It isn't an immediate

  if ((Imm.I >= -16 && Imm.I <= 64) ||
      Imm.F == 0.5f || Imm.F == -0.5f ||
      Imm.F == 1.0f || Imm.F == -1.0f ||
      Imm.F == 2.0f || Imm.F == -2.0f ||
      Imm.F == 4.0f || Imm.F == -4.0f)
    return 0; // It's an inline immediate

  return Imm.I; // It's a literal immediate
}

/// \brief Try to fold an immediate directly into an instruction
bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
                               bool &ScalarSlotUsed) const {

  MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
    return false;

  const SDValue &Op = Mov->getOperand(0);
  int32_t Value = analyzeImmediate(Op.getNode());
  if (Value == -1) {
    // Not an immediate at all
    return false;

  } else if (Value == 0) {
    // Inline immediates can always be folded
    Operand = Op;
    return true;

  } else if (Value == Immediate) {
    // Already-folded literal immediate
    Operand = Op;
    return true;

  } else if (!ScalarSlotUsed && !Immediate) {
    // Fold this literal immediate
    ScalarSlotUsed = true;
    Immediate = Value;
    Operand = Op;
    return true;

  }

  return false;
}

const TargetRegisterClass *SITargetLowering::getRegClassForNode(
                                  SelectionDAG &DAG, const SDValue &Op) const {
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  if (!Op->isMachineOpcode()) {
    switch(Op->getOpcode()) {
    case ISD::CopyFromReg: {
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        return MRI.getRegClass(Reg);
      }
      return TRI.getPhysRegClass(Reg);
    }
    default: return NULL;
    }
  }
  const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
  int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
  if (OpClassID != -1) {
    return TRI.getRegClass(OpClassID);
  }
  switch(Op.getMachineOpcode()) {
  case AMDGPU::COPY_TO_REGCLASS:
    // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
    OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();

    // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
    // class, then the register class for the value could be either a
    // VReg or an SReg. In order to get a more accurate result we look at
    // the node being copied.
    if (OpClassID == AMDGPU::VSrc_32RegClassID ||
        OpClassID == AMDGPU::VSrc_64RegClassID) {
      return getRegClassForNode(DAG, Op.getOperand(0));
    }
    return TRI.getRegClass(OpClassID);
  case AMDGPU::EXTRACT_SUBREG: {
    int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    const TargetRegisterClass *SuperClass =
      getRegClassForNode(DAG, Op.getOperand(0));
    return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
  }
  case AMDGPU::REG_SEQUENCE:
    // Operand 0 is the register class id for REG_SEQUENCE instructions.
    return TRI.getRegClass(
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
  default:
    return getRegClassFor(Op.getSimpleValueType());
  }
}

/// \brief Does "Op" fit into register class "RegClass"?
bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                    unsigned RegClass) const {
  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
  const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
  if (!RC) {
    return false;
  }
  return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
}

/// \brief Make sure that we don't exceed the number of allowed scalars
void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
                                       unsigned RegClass,
                                       bool &ScalarSlotUsed) const {

  // First map the operand's register class to a destination class
  if (RegClass == AMDGPU::VSrc_32RegClassID)
    RegClass = AMDGPU::VReg_32RegClassID;
  else if (RegClass == AMDGPU::VSrc_64RegClassID)
    RegClass = AMDGPU::VReg_64RegClassID;
  else
    return;

  // Nothing to do if they fit naturally
  if (fitsRegClass(DAG, Operand, RegClass))
    return;

  // If the scalar slot isn't used yet, use it now
  if (!ScalarSlotUsed) {
    ScalarSlotUsed = true;
    return;
  }

  // This is a conservative approach. It is possible that we can't determine
  // the correct register class and copy too often, but better safe than
  // sorry.
  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
                                    Operand.getValueType(), Operand, RC);
  Operand = SDValue(Node, 0);
}

/// \returns true if \p Node's operands are different from the SDValue list
/// \p Ops
static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
  for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
    if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
      return true;
    }
  }
  return false;
}
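
// Background for foldOperands below: most VALU instructions have a compact
// 32-bit (e32) encoding and a 64-bit (e64) encoding that appends four
// source-modifier operands (hence the NumOps + 4 assertion); VSrc operands
// accept a VGPR, an SGPR or an immediate, while SSrc operands accept only
// an SGPR or an immediate, which is what the scalar-slot bookkeeping here
// rations.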

/// \brief Try to fold the Node's operands into the Node
SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
                                       SelectionDAG &DAG) const {

  // Original encoding (either e32 or e64)
  int Opcode = Node->getMachineOpcode();
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const MCInstrDesc *Desc = &TII->get(Opcode);

  unsigned NumDefs = Desc->getNumDefs();
  unsigned NumOps = Desc->getNumOperands();

  // Commuted opcode if available
  int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
  const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev);

  assert(!DescRev || DescRev->getNumDefs() == NumDefs);
  assert(!DescRev || DescRev->getNumOperands() == NumOps);

  // e64 version if available, -1 otherwise
  int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64);

  assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));

  int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
  bool HaveVSrc = false, HaveSSrc = false;

  // First figure out what we already have in this instruction
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass))
      HaveVSrc = true;
    else if (isSSrc(RegClass))
      HaveSSrc = true;
    else
      continue;

    int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
    if (Imm != -1 && Imm != 0) {
      // Literal immediate
      Immediate = Imm;
    }
  }

  // If we have neither VSrc nor SSrc, it makes no sense to continue
  if (!HaveVSrc && !HaveSSrc)
    return Node;

  // No scalar allowed when we have both VSrc and SSrc
  bool ScalarSlotUsed = HaveVSrc && HaveSSrc;

  // Second, go over the operands and try to fold them
  std::vector<SDValue> Ops;
  bool Promote2e64 = false;
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    const SDValue &Operand = Node->getOperand(i);
    Ops.push_back(Operand);

    // Already folded immediate?
    if (isa<ConstantSDNode>(Operand.getNode()) ||
        isa<ConstantFPSDNode>(Operand.getNode()))
      continue;

    // Is this a VSrc or SSrc operand?
    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass) || isSSrc(RegClass)) {
      // Try to fold the immediates
      if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
        // Folding didn't work; make sure we don't hit the SReg limit
        ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
      }
      continue;
    }

    if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {

      unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
      assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));

      // Test if it makes sense to swap operands
      if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[1], RegClass) &&
           fitsRegClass(DAG, Ops[1], OtherRegClass))) {

        // Swap commutable operands
        SDValue Tmp = Ops[1];
        Ops[1] = Ops[0];
        Ops[0] = Tmp;

        Desc = DescRev;
        DescRev = 0;
        continue;
      }
    }

    if (DescE64 && !Immediate) {

      // Test if it makes sense to switch to e64 encoding
      unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
      if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
        continue;

      int32_t TmpImm = -1;
      if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[i], RegClass) &&
           fitsRegClass(DAG, Ops[i], OtherRegClass))) {

        // Switch to e64 encoding
        Immediate = -1;
        Promote2e64 = true;
        Desc = DescE64;
        DescE64 = 0;
      }
    }
  }

  if (Promote2e64) {
    // Add the modifier flags while promoting
    for (unsigned i = 0; i < 4; ++i)
      Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
  }

  // Add optional chain and glue
  for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
    Ops.push_back(Node->getOperand(i));

  // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
  // this case a brand new node is always created, even if the operands
  // are the same as before. So, manually check if anything has been changed.
  if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
    return Node;
  }

  // Create a completely new instruction
  return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
}

/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}

/// \brief Adjust the writemask of MIMG instructions
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Lane = 0;
  unsigned OldDmask = Node->getConstantOperandVal(0);
  unsigned NewDmask = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
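    // For example, if OldDmask is 0b1010 (Y and W written, packed into two
    // lanes) and this user extracts sub1, then Lane is 1 and the loop below
    // finds the second set bit: Comp = 3, so component W feeds this use.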
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Set which texture component corresponds to the lane.
    unsigned Comp;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      assert(Dmask);
      Comp = countTrailingZeros(Dmask);
      Dmask &= ~(1 << Comp);
    }

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    NewDmask |= 1 << Comp;
  }

  // Abort if there's no change
  if (NewDmask == OldDmask)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
    Ops.push_back(Node->getOperand(i));
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());

  // If we only got one lane, replace it with a copy
  // (if NewDmask has only one bit set...)
  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}

/// \brief Fold the instructions after selecting them
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  Node = AdjustRegClass(Node, DAG);

  if (TII->isMIMG(Node->getMachineOpcode()))
    adjustWritemask(Node, DAG);

  return foldOperands(Node, DAG);
}
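
// Note: trimming the dmask also changes how many VGPRs the MIMG result
// occupies (one per set bit), which is why the hook below re-selects the
// opcode and shrinks the destination register class to match.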

/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  if (!TII->isMIMG(MI->getOpcode()))
    return;

  unsigned VReg = MI->getOperand(0).getReg();
  unsigned Writemask = MI->getOperand(1).getImm();
  unsigned BitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)
    BitsSet += Writemask & (1 << i) ? 1 : 0;

  const TargetRegisterClass *RC;
  switch (BitsSet) {
  default: return;
  case 1: RC = &AMDGPU::VReg_32RegClass; break;
  case 2: RC = &AMDGPU::VReg_64RegClass; break;
  case 3: RC = &AMDGPU::VReg_96RegClass; break;
  }

  unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
  MI->setDesc(TII->get(NewOpcode));
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  MRI.setRegClass(VReg, RC);
}

MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
                                                SelectionDAG &DAG) const {

  SDLoc DL(N);
  unsigned NewOpcode = N->getMachineOpcode();

  switch (N->getMachineOpcode()) {
  default: return N;
  case AMDGPU::S_LOAD_DWORD_IMM:
    NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX2_SGPR:
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
    }
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
    }
    if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
      return N;
    }
    ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
    SDValue Ops[] = {
      SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
                                 DAG.getConstant(0, MVT::i64)), 0),
      N->getOperand(0),
      DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
    };
    return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
  }
  }
}

SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}