SIISelLowering.cpp revision 6948897e478cbd66626159776a8017b3c18579b9
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10/// \file 11/// \brief Custom DAG lowering for SI 12// 13//===----------------------------------------------------------------------===// 14 15#ifdef _MSC_VER 16// Provide M_PI. 17#define _USE_MATH_DEFINES 18#include <cmath> 19#endif 20 21#include "SIISelLowering.h" 22#include "AMDGPU.h" 23#include "AMDGPUIntrinsicInfo.h" 24#include "AMDGPUSubtarget.h" 25#include "SIInstrInfo.h" 26#include "SIMachineFunctionInfo.h" 27#include "SIRegisterInfo.h" 28#include "llvm/ADT/BitVector.h" 29#include "llvm/CodeGen/CallingConvLower.h" 30#include "llvm/CodeGen/MachineInstrBuilder.h" 31#include "llvm/CodeGen/MachineRegisterInfo.h" 32#include "llvm/CodeGen/SelectionDAG.h" 33#include "llvm/IR/Function.h" 34#include "llvm/ADT/SmallString.h" 35 36using namespace llvm; 37 38SITargetLowering::SITargetLowering(TargetMachine &TM, 39 const AMDGPUSubtarget &STI) 40 : AMDGPUTargetLowering(TM, STI) { 41 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); 42 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 43 44 addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); 45 addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); 46 47 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); 48 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); 49 50 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); 51 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); 52 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); 53 54 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); 55 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 56 57 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); 58 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); 59 60 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); 61 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); 62 63 computeRegisterProperties(STI.getRegisterInfo()); 64 65 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); 66 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); 67 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); 68 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); 69 70 setOperationAction(ISD::ADD, MVT::i32, Legal); 71 setOperationAction(ISD::ADDC, MVT::i32, Legal); 72 setOperationAction(ISD::ADDE, MVT::i32, Legal); 73 setOperationAction(ISD::SUBC, MVT::i32, Legal); 74 setOperationAction(ISD::SUBE, MVT::i32, Legal); 75 76 setOperationAction(ISD::FSIN, MVT::f32, Custom); 77 setOperationAction(ISD::FCOS, MVT::f32, Custom); 78 79 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 80 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 81 82 // We need to custom lower vector stores from local memory 83 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 84 setOperationAction(ISD::LOAD, MVT::v8i32, Custom); 85 setOperationAction(ISD::LOAD, MVT::v16i32, Custom); 86 87 setOperationAction(ISD::STORE, MVT::v8i32, Custom); 88 setOperationAction(ISD::STORE, MVT::v16i32, Custom); 89 90 setOperationAction(ISD::STORE, MVT::i1, Custom); 91 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 92 93 setOperationAction(ISD::SELECT, MVT::i64, Custom); 94 setOperationAction(ISD::SELECT, MVT::f64, Promote); 95 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); 96 97 
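  // A quick legend for the operation actions set throughout this constructor:
  // Legal nodes are selected as-is, Custom nodes are routed through
  // LowerOperation(), Expand nodes are rewritten by the generic legalizer in
  // terms of other operations, and Promote performs the operation in the type
  // registered with AddPromotedToType().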
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); 98 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); 99 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 100 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); 101 102 setOperationAction(ISD::SETCC, MVT::v2i1, Expand); 103 setOperationAction(ISD::SETCC, MVT::v4i1, Expand); 104 105 setOperationAction(ISD::BSWAP, MVT::i32, Legal); 106 107 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); 108 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); 109 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); 110 111 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); 112 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); 113 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); 114 115 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 116 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); 117 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); 118 119 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 120 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); 121 122 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 123 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); 124 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); 125 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); 126 127 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 128 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 129 130 for (MVT VT : MVT::integer_valuetypes()) { 131 if (VT == MVT::i64) 132 continue; 133 134 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 135 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); 136 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); 137 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); 138 139 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 140 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); 141 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); 142 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); 143 144 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 145 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); 146 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); 147 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); 148 } 149 150 for (MVT VT : MVT::integer_vector_valuetypes()) { 151 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); 152 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); 153 } 154 155 for (MVT VT : MVT::fp_valuetypes()) 156 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 157 158 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 159 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); 160 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); 161 162 setOperationAction(ISD::LOAD, MVT::i1, Custom); 163 164 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 165 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 166 setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 167 168 // These should use UDIVREM, so set them to expand 169 setOperationAction(ISD::UDIV, MVT::i64, Expand); 170 setOperationAction(ISD::UREM, MVT::i64, Expand); 171 172 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); 173 setOperationAction(ISD::SELECT, MVT::i1, Promote); 174 175 // We only support LOAD/STORE and vector manipulation ops for vectors 176 // with > 4 elements. 
177 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { 178 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 179 switch(Op) { 180 case ISD::LOAD: 181 case ISD::STORE: 182 case ISD::BUILD_VECTOR: 183 case ISD::BITCAST: 184 case ISD::EXTRACT_VECTOR_ELT: 185 case ISD::INSERT_VECTOR_ELT: 186 case ISD::INSERT_SUBVECTOR: 187 case ISD::EXTRACT_SUBVECTOR: 188 break; 189 case ISD::CONCAT_VECTORS: 190 setOperationAction(Op, VT, Custom); 191 break; 192 default: 193 setOperationAction(Op, VT, Expand); 194 break; 195 } 196 } 197 } 198 199 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 200 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 201 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 202 setOperationAction(ISD::FRINT, MVT::f64, Legal); 203 } 204 205 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 206 setOperationAction(ISD::FDIV, MVT::f32, Custom); 207 setOperationAction(ISD::FDIV, MVT::f64, Custom); 208 209 setTargetDAGCombine(ISD::FADD); 210 setTargetDAGCombine(ISD::FSUB); 211 setTargetDAGCombine(ISD::FMINNUM); 212 setTargetDAGCombine(ISD::FMAXNUM); 213 setTargetDAGCombine(ISD::SMIN); 214 setTargetDAGCombine(ISD::SMAX); 215 setTargetDAGCombine(ISD::UMIN); 216 setTargetDAGCombine(ISD::UMAX); 217 setTargetDAGCombine(ISD::SELECT_CC); 218 setTargetDAGCombine(ISD::SETCC); 219 setTargetDAGCombine(ISD::AND); 220 setTargetDAGCombine(ISD::OR); 221 setTargetDAGCombine(ISD::UINT_TO_FP); 222 223 // All memory operations. Some folding on the pointer operand is done to help 224 // matching the constant offsets in the addressing modes. 225 setTargetDAGCombine(ISD::LOAD); 226 setTargetDAGCombine(ISD::STORE); 227 setTargetDAGCombine(ISD::ATOMIC_LOAD); 228 setTargetDAGCombine(ISD::ATOMIC_STORE); 229 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); 230 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); 231 setTargetDAGCombine(ISD::ATOMIC_SWAP); 232 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); 233 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); 234 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); 235 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); 236 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); 237 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); 238 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); 239 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); 240 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); 241 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); 242 243 setSchedulingPreference(Sched::RegPressure); 244} 245 246//===----------------------------------------------------------------------===// 247// TargetLowering queries 248//===----------------------------------------------------------------------===// 249 250bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, 251 EVT) const { 252 // SI has some legal vector types, but no legal vector operations. Say no 253 // shuffles are legal in order to prefer scalarizing some vector operations. 254 return false; 255} 256 257bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, 258 Type *Ty, unsigned AS) const { 259 // No global is ever allowed as a base. 260 if (AM.BaseGV) 261 return false; 262 263 switch (AS) { 264 case AMDGPUAS::GLOBAL_ADDRESS: 265 case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? 266 case AMDGPUAS::PRIVATE_ADDRESS: 267 case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { 268 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and 269 // additionally can do r + r + i with addr64. 32-bit has more addressing 270 // mode options. 
Depending on the resource constant, it can also do 271 // (i64 r0) + (i32 r1) * (i14 i). 272 // 273 // SMRD instructions have an 8-bit, dword offset. 274 // 275 // Assume nonunifom access, since the address space isn't enough to know 276 // what instruction we will use, and since we don't know if this is a load 277 // or store and scalar stores are only available on VI. 278 // 279 // We also know if we are doing an extload, we can't do a scalar load. 280 // 281 // Private arrays end up using a scratch buffer most of the time, so also 282 // assume those use MUBUF instructions. Scratch loads / stores are currently 283 // implemented as mubuf instructions with offen bit set, so slightly 284 // different than the normal addr64. 285 if (!isUInt<12>(AM.BaseOffs)) 286 return false; 287 288 // FIXME: Since we can split immediate into soffset and immediate offset, 289 // would it make sense to allow any immediate? 290 291 switch (AM.Scale) { 292 case 0: // r + i or just i, depending on HasBaseReg. 293 return true; 294 case 1: 295 return true; // We have r + r or r + i. 296 case 2: 297 if (AM.HasBaseReg) { 298 // Reject 2 * r + r. 299 return false; 300 } 301 302 // Allow 2 * r as r + r 303 // Or 2 * r + i is allowed as r + r + i. 304 return true; 305 default: // Don't allow n * r 306 return false; 307 } 308 } 309 case AMDGPUAS::LOCAL_ADDRESS: 310 case AMDGPUAS::REGION_ADDRESS: { 311 // Basic, single offset DS instructions allow a 16-bit unsigned immediate 312 // field. 313 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have 314 // an 8-bit dword offset but we don't know the alignment here. 315 if (!isUInt<16>(AM.BaseOffs)) 316 return false; 317 318 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 319 return true; 320 321 if (AM.Scale == 1 && AM.HasBaseReg) 322 return true; 323 324 return false; 325 } 326 case AMDGPUAS::FLAT_ADDRESS: { 327 // Flat instructions do not have offsets, and only have the register 328 // address. 329 return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); 330 } 331 default: 332 llvm_unreachable("unhandled address space"); 333 } 334} 335 336bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 337 unsigned AddrSpace, 338 unsigned Align, 339 bool *IsFast) const { 340 if (IsFast) 341 *IsFast = false; 342 343 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, 344 // which isn't a simple VT. 345 if (!VT.isSimple() || VT == MVT::Other) 346 return false; 347 348 // TODO - CI+ supports unaligned memory accesses, but this requires driver 349 // support. 350 351 // XXX - The only mention I see of this in the ISA manual is for LDS direct 352 // reads the "byte address and must be dword aligned". Is it also true for the 353 // normal loads and stores? 354 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { 355 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte 356 // aligned, 8 byte access in a single operation using ds_read2/write2_b32 357 // with adjacent offsets. 358 return Align % 4 == 0; 359 } 360 361 // Smaller than dword value must be aligned. 362 // FIXME: This should be allowed on CI+ 363 if (VT.bitsLT(MVT::i32)) 364 return false; 365 366 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the 367 // byte-address are ignored, thus forcing Dword alignment. 368 // This applies to private, global, and constant memory. 
369 if (IsFast) 370 *IsFast = true; 371 372 return VT.bitsGT(MVT::i32) && Align % 4 == 0; 373} 374 375EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, 376 unsigned SrcAlign, bool IsMemset, 377 bool ZeroMemset, 378 bool MemcpyStrSrc, 379 MachineFunction &MF) const { 380 // FIXME: Should account for address space here. 381 382 // The default fallback uses the private pointer size as a guess for a type to 383 // use. Make sure we switch these to 64-bit accesses. 384 385 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global 386 return MVT::v4i32; 387 388 if (Size >= 8 && DstAlign >= 4) 389 return MVT::v2i32; 390 391 // Use the default. 392 return MVT::Other; 393} 394 395TargetLoweringBase::LegalizeTypeAction 396SITargetLowering::getPreferredVectorAction(EVT VT) const { 397 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) 398 return TypeSplitVector; 399 400 return TargetLoweringBase::getPreferredVectorAction(VT); 401} 402 403bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 404 Type *Ty) const { 405 const SIInstrInfo *TII = 406 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 407 return TII->isInlineConstant(Imm); 408} 409 410static EVT toIntegerVT(EVT VT) { 411 if (VT.isVector()) 412 return VT.changeVectorElementTypeToInteger(); 413 return MVT::getIntegerVT(VT.getSizeInBits()); 414} 415 416SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, 417 SDLoc SL, SDValue Chain, 418 unsigned Offset, bool Signed) const { 419 const DataLayout *DL = getDataLayout(); 420 MachineFunction &MF = DAG.getMachineFunction(); 421 const SIRegisterInfo *TRI = 422 static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); 423 unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); 424 425 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 426 427 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 428 MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); 429 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); 430 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, 431 MRI.getLiveInVirtReg(InputPtrReg), PtrVT); 432 SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, 433 DAG.getConstant(Offset, SL, PtrVT)); 434 SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); 435 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); 436 437 unsigned Align = DL->getABITypeAlignment(Ty); 438 439 if (VT != MemVT && VT.isFloatingPoint()) { 440 // Do an integer load and convert. 441 // FIXME: This is mostly because load legalization after type legalization 442 // doesn't handle FP extloads. 443 assert(VT.getScalarType() == MVT::f32 && 444 MemVT.getScalarType() == MVT::f16); 445 446 EVT IVT = toIntegerVT(VT); 447 EVT MemIVT = toIntegerVT(MemVT); 448 SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, 449 IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, 450 false, // isVolatile 451 true, // isNonTemporal 452 true, // isInvariant 453 Align); // Alignment 454 return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load); 455 } 456 457 ISD::LoadExtType ExtTy = Signed ? 
                                ISD::SEXTLOAD : ISD::ZEXTLOAD;
  return DAG.getLoad(ISD::UNINDEXED, ExtTy,
                     VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
                     false, // isVolatile
                     true, // isNonTemporal
                     true, // isInvariant
                     Align); // Alignment
}

SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  assert(CallConv == CallingConv::C);

  SmallVector<ISD::InputArg, 16> Splits;
  BitVector Skipped(Ins.size());

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr
    if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
        !Arg.Flags.isByVal()) {

      assert((PSInputNum <= 15) && "Too many PS inputs!");

      if (!Arg.Used) {
        // We can safely skip PS inputs
        Skipped.set(i);
        ++PSInputNum;
        continue;
      }

      Info->PSInputAddr |= 1 << PSInputNum++;
    }

    // Second, split vertices into their elements
    if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
      ISD::InputArg NewArg = Arg;
      NewArg.Flags.setSplit();
      NewArg.VT = Arg.VT.getVectorElementType();

      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
      // three or five element vertex only needs three or five registers,
      // NOT four or eight.
      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
      unsigned NumElements = ParamType->getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += NewArg.VT.getStoreSize();
      }

    } else if (Info->getShaderType() != ShaderType::COMPUTE) {
      Splits.push_back(Arg);
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  if (Info->getShaderType() == ShaderType::PIXEL &&
      (Info->PSInputAddr & 0x7F) == 0) {
    Info->PSInputAddr |= 1;
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
  }

  // The pointer to the list of arguments is stored in SGPR0, SGPR1
  // The pointer to the scratch buffer is stored in SGPR2, SGPR3
  if (Info->getShaderType() == ShaderType::COMPUTE) {
    if (Subtarget->isAmdHsaOS())
      Info->NumUserSGPRs = 2;  // FIXME: Need to support scratch buffers.
540 else 541 Info->NumUserSGPRs = 4; 542 543 unsigned InputPtrReg = 544 TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); 545 unsigned InputPtrRegLo = 546 TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); 547 unsigned InputPtrRegHi = 548 TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); 549 550 unsigned ScratchPtrReg = 551 TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); 552 unsigned ScratchPtrRegLo = 553 TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); 554 unsigned ScratchPtrRegHi = 555 TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); 556 557 CCInfo.AllocateReg(InputPtrRegLo); 558 CCInfo.AllocateReg(InputPtrRegHi); 559 CCInfo.AllocateReg(ScratchPtrRegLo); 560 CCInfo.AllocateReg(ScratchPtrRegHi); 561 MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); 562 MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); 563 } 564 565 if (Info->getShaderType() == ShaderType::COMPUTE) { 566 getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, 567 Splits); 568 } 569 570 AnalyzeFormalArguments(CCInfo, Splits); 571 572 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { 573 574 const ISD::InputArg &Arg = Ins[i]; 575 if (Skipped[i]) { 576 InVals.push_back(DAG.getUNDEF(Arg.VT)); 577 continue; 578 } 579 580 CCValAssign &VA = ArgLocs[ArgIdx++]; 581 MVT VT = VA.getLocVT(); 582 583 if (VA.isMemLoc()) { 584 VT = Ins[i].VT; 585 EVT MemVT = Splits[i].VT; 586 const unsigned Offset = 36 + VA.getLocMemOffset(); 587 // The first 36 bytes of the input buffer contains information about 588 // thread group and global sizes. 589 SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), 590 Offset, Ins[i].Flags.isSExt()); 591 592 const PointerType *ParamTy = 593 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); 594 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && 595 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 596 // On SI local pointers are just offsets into LDS, so they are always 597 // less than 16-bits. On CI and newer they could potentially be 598 // real pointers, so we can't guarantee their size. 
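      // AssertZext with an i16 value type records that the bits above bit 15
      // of the loaded value are known to be zero, letting later combines drop
      // redundant zero-extensions and masks of the LDS offset.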
599 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, 600 DAG.getValueType(MVT::i16)); 601 } 602 603 InVals.push_back(Arg); 604 Info->ABIArgOffset = Offset + MemVT.getStoreSize(); 605 continue; 606 } 607 assert(VA.isRegLoc() && "Parameter must be in a register!"); 608 609 unsigned Reg = VA.getLocReg(); 610 611 if (VT == MVT::i64) { 612 // For now assume it is a pointer 613 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, 614 &AMDGPU::SReg_64RegClass); 615 Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); 616 InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); 617 continue; 618 } 619 620 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); 621 622 Reg = MF.addLiveIn(Reg, RC); 623 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 624 625 if (Arg.VT.isVector()) { 626 627 // Build a vector from the registers 628 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 629 unsigned NumElements = ParamType->getVectorNumElements(); 630 631 SmallVector<SDValue, 4> Regs; 632 Regs.push_back(Val); 633 for (unsigned j = 1; j != NumElements; ++j) { 634 Reg = ArgLocs[ArgIdx++].getLocReg(); 635 Reg = MF.addLiveIn(Reg, RC); 636 Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); 637 } 638 639 // Fill up the missing vector elements 640 NumElements = Arg.VT.getVectorNumElements() - NumElements; 641 Regs.append(NumElements, DAG.getUNDEF(VT)); 642 643 InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); 644 continue; 645 } 646 647 InVals.push_back(Val); 648 } 649 650 if (Info->getShaderType() != ShaderType::COMPUTE) { 651 unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>( 652 AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); 653 Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); 654 } 655 return Chain; 656} 657 658MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( 659 MachineInstr * MI, MachineBasicBlock * BB) const { 660 661 MachineBasicBlock::iterator I = *MI; 662 const SIInstrInfo *TII = 663 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 664 665 switch (MI->getOpcode()) { 666 default: 667 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 668 case AMDGPU::BRANCH: 669 return BB; 670 case AMDGPU::SI_RegisterStorePseudo: { 671 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 672 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 673 MachineInstrBuilder MIB = 674 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), 675 Reg); 676 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) 677 MIB.addOperand(MI->getOperand(i)); 678 679 MI->eraseFromParent(); 680 break; 681 } 682 } 683 return BB; 684} 685 686bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { 687 // This currently forces unfolding various combinations of fsub into fma with 688 // free fneg'd operands. As long as we have fast FMA (controlled by 689 // isFMAFasterThanFMulAndFAdd), we should perform these. 690 691 // When fma is quarter rate, for f64 where add / sub are at best half rate, 692 // most of these combines appear to be cycle neutral but save on instruction 693 // count / code size. 
  return true;
}

EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
  return MVT::i32;
}

// Answering this is somewhat tricky and depends on the specific device, since
// different devices can have different rates for fma or for all f64
// operations.
//
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
// regardless of which device (although the number of cycles differs between
// devices), so it is always profitable for f64.
//
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
// only on full rate devices. Normally, we should prefer selecting v_mad_f32,
// which we can always do even without fused FP ops since it returns the same
// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32,
// however, does not support denormals, so we do report fma as faster if we
// have a fast fma device and require denormals.
//
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
    // This is as fast on some subtargets. However, we always have full rate
    // f32 mad available, which returns the same result as the separate
    // operations and which we should prefer over fma. We can't use mad if we
    // want to support denormals, so only report fma as faster in that case.
    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::FDIV: return LowerFDIV(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
  }
  return SDValue();
}

/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
return *I; 791 } 792 return nullptr; 793} 794 795SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { 796 797 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); 798 unsigned FrameIndex = FINode->getIndex(); 799 800 return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); 801} 802 803/// This transforms the control flow intrinsics to get the branch destination as 804/// last parameter, also switches branch target with BR if the need arise 805SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 806 SelectionDAG &DAG) const { 807 808 SDLoc DL(BRCOND); 809 810 SDNode *Intr = BRCOND.getOperand(1).getNode(); 811 SDValue Target = BRCOND.getOperand(2); 812 SDNode *BR = nullptr; 813 814 if (Intr->getOpcode() == ISD::SETCC) { 815 // As long as we negate the condition everything is fine 816 SDNode *SetCC = Intr; 817 assert(SetCC->getConstantOperandVal(1) == 1); 818 assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 819 ISD::SETNE); 820 Intr = SetCC->getOperand(0).getNode(); 821 822 } else { 823 // Get the target from BR if we don't negate the condition 824 BR = findUser(BRCOND, ISD::BR); 825 Target = BR->getOperand(1); 826 } 827 828 assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); 829 830 // Build the result and 831 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); 832 833 // operands of the new intrinsic call 834 SmallVector<SDValue, 4> Ops; 835 Ops.push_back(BRCOND.getOperand(0)); 836 Ops.append(Intr->op_begin() + 1, Intr->op_end()); 837 Ops.push_back(Target); 838 839 // build the new intrinsic call 840 SDNode *Result = DAG.getNode( 841 Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 842 DAG.getVTList(Res), Ops).getNode(); 843 844 if (BR) { 845 // Give the branch instruction our target 846 SDValue Ops[] = { 847 BR->getOperand(0), 848 BRCOND.getOperand(2) 849 }; 850 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 851 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 852 BR = NewBR.getNode(); 853 } 854 855 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 856 857 // Copy the intrinsic results to registers 858 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 859 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 860 if (!CopyToReg) 861 continue; 862 863 Chain = DAG.getCopyToReg( 864 Chain, DL, 865 CopyToReg->getOperand(1), 866 SDValue(Result, i - 1), 867 SDValue()); 868 869 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 870 } 871 872 // Remove the old intrinsic from the chain 873 DAG.ReplaceAllUsesOfValueWith( 874 SDValue(Intr, Intr->getNumValues() - 1), 875 Intr->getOperand(0)); 876 877 return Chain; 878} 879 880SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 881 SDValue Op, 882 SelectionDAG &DAG) const { 883 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 884 885 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) 886 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 887 888 SDLoc DL(GSD); 889 const GlobalValue *GV = GSD->getGlobal(); 890 MVT PtrVT = getPointerTy(GSD->getAddressSpace()); 891 892 SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); 893 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); 894 895 SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, 896 DAG.getConstant(0, DL, MVT::i32)); 897 SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, 898 DAG.getConstant(1, DL, MVT::i32)); 899 900 SDValue Lo = 
               DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
                           PtrLo, GA);
  SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
                           PtrHi, DAG.getConstant(0, DL, MVT::i32),
                           SDValue(Lo.getNode(), 1));
  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
}

SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
                                   SDValue V) const {
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We can't use S_MOV_B32, because there is no way to specify m0 as the
  // destination register.
  //
  // We have to use them both. MachineCSE will combine all the S_MOV_B32
  // instructions and the register coalescer will eliminate the extra copies.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V);
  return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32),
                          SDValue(M0, 0), SDValue()); // Glue
                                                      // A Null SDValue creates
                                                      // a glue result.
}

SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  switch (IntrinsicID) {
  case Intrinsic::r600_read_ngroups_x:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::NGROUPS_X, false);
  case Intrinsic::r600_read_ngroups_y:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::NGROUPS_Y, false);
  case Intrinsic::r600_read_ngroups_z:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::NGROUPS_Z, false);
  case Intrinsic::r600_read_global_size_x:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
  case Intrinsic::r600_read_global_size_y:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
  case Intrinsic::r600_read_global_size_z:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
  case Intrinsic::r600_read_local_size_x:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::LOCAL_SIZE_X, false);
  case Intrinsic::r600_read_local_size_y:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
  case Intrinsic::r600_read_local_size_z:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::LOCAL_SIZE_Z, false);

  case Intrinsic::AMDGPU_read_workdim:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
                          false);

  case Intrinsic::r600_read_tgid_x:
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
  case Intrinsic::r600_read_tgid_y:
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT);
  case Intrinsic::r600_read_tgid_z:
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
      TRI->getPreloadedValue(MF,
SIRegisterInfo::TGID_Z), VT); 978 case Intrinsic::r600_read_tidig_x: 979 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 980 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); 981 case Intrinsic::r600_read_tidig_y: 982 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 983 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); 984 case Intrinsic::r600_read_tidig_z: 985 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 986 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); 987 case AMDGPUIntrinsic::SI_load_const: { 988 SDValue Ops[] = { 989 Op.getOperand(1), 990 Op.getOperand(2) 991 }; 992 993 MachineMemOperand *MMO = MF.getMachineMemOperand( 994 MachinePointerInfo(), 995 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, 996 VT.getStoreSize(), 4); 997 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, 998 Op->getVTList(), Ops, VT, MMO); 999 } 1000 case AMDGPUIntrinsic::SI_sample: 1001 return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); 1002 case AMDGPUIntrinsic::SI_sampleb: 1003 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); 1004 case AMDGPUIntrinsic::SI_sampled: 1005 return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); 1006 case AMDGPUIntrinsic::SI_samplel: 1007 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); 1008 case AMDGPUIntrinsic::SI_vs_load_input: 1009 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, 1010 Op.getOperand(1), 1011 Op.getOperand(2), 1012 Op.getOperand(3)); 1013 1014 case AMDGPUIntrinsic::AMDGPU_fract: 1015 case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 1016 return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), 1017 DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); 1018 case AMDGPUIntrinsic::SI_fs_constant: { 1019 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 1020 SDValue Glue = M0.getValue(1); 1021 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, 1022 DAG.getConstant(2, DL, MVT::i32), // P0 1023 Op.getOperand(1), Op.getOperand(2), Glue); 1024 } 1025 case AMDGPUIntrinsic::SI_fs_interp: { 1026 SDValue IJ = Op.getOperand(4); 1027 SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 1028 DAG.getConstant(0, DL, MVT::i32)); 1029 SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 1030 DAG.getConstant(1, DL, MVT::i32)); 1031 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 1032 SDValue Glue = M0.getValue(1); 1033 SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, 1034 DAG.getVTList(MVT::f32, MVT::Glue), 1035 I, Op.getOperand(1), Op.getOperand(2), Glue); 1036 Glue = SDValue(P1.getNode(), 1); 1037 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, 1038 Op.getOperand(1), Op.getOperand(2), Glue); 1039 } 1040 default: 1041 return AMDGPUTargetLowering::LowerOperation(Op, DAG); 1042 } 1043} 1044 1045SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 1046 SelectionDAG &DAG) const { 1047 MachineFunction &MF = DAG.getMachineFunction(); 1048 SDLoc DL(Op); 1049 SDValue Chain = Op.getOperand(0); 1050 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 1051 1052 switch (IntrinsicID) { 1053 case AMDGPUIntrinsic::SI_sendmsg: { 1054 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 1055 SDValue Glue = Chain.getValue(1); 1056 return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, 1057 Op.getOperand(2), Glue); 1058 } 1059 case AMDGPUIntrinsic::SI_tbuffer_store: { 1060 SDValue Ops[] = { 1061 Chain, 1062 Op.getOperand(2), 1063 Op.getOperand(3), 1064 
Op.getOperand(4), 1065 Op.getOperand(5), 1066 Op.getOperand(6), 1067 Op.getOperand(7), 1068 Op.getOperand(8), 1069 Op.getOperand(9), 1070 Op.getOperand(10), 1071 Op.getOperand(11), 1072 Op.getOperand(12), 1073 Op.getOperand(13), 1074 Op.getOperand(14) 1075 }; 1076 1077 EVT VT = Op.getOperand(3).getValueType(); 1078 1079 MachineMemOperand *MMO = MF.getMachineMemOperand( 1080 MachinePointerInfo(), 1081 MachineMemOperand::MOStore, 1082 VT.getStoreSize(), 4); 1083 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, 1084 Op->getVTList(), Ops, VT, MMO); 1085 } 1086 default: 1087 return SDValue(); 1088 } 1089} 1090 1091SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 1092 SDLoc DL(Op); 1093 LoadSDNode *Load = cast<LoadSDNode>(Op); 1094 1095 if (Op.getValueType().isVector()) { 1096 assert(Op.getValueType().getVectorElementType() == MVT::i32 && 1097 "Custom lowering for non-i32 vectors hasn't been implemented."); 1098 unsigned NumElements = Op.getValueType().getVectorNumElements(); 1099 assert(NumElements != 2 && "v2 loads are supported for all address spaces."); 1100 switch (Load->getAddressSpace()) { 1101 default: break; 1102 case AMDGPUAS::GLOBAL_ADDRESS: 1103 case AMDGPUAS::PRIVATE_ADDRESS: 1104 // v4 loads are supported for private and global memory. 1105 if (NumElements <= 4) 1106 break; 1107 // fall-through 1108 case AMDGPUAS::LOCAL_ADDRESS: 1109 return ScalarizeVectorLoad(Op, DAG); 1110 } 1111 } 1112 1113 return AMDGPUTargetLowering::LowerLOAD(Op, DAG); 1114} 1115 1116SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, 1117 const SDValue &Op, 1118 SelectionDAG &DAG) const { 1119 return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), 1120 Op.getOperand(2), 1121 Op.getOperand(3), 1122 Op.getOperand(4)); 1123} 1124 1125SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 1126 if (Op.getValueType() != MVT::i64) 1127 return SDValue(); 1128 1129 SDLoc DL(Op); 1130 SDValue Cond = Op.getOperand(0); 1131 1132 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 1133 SDValue One = DAG.getConstant(1, DL, MVT::i32); 1134 1135 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 1136 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 1137 1138 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 1139 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 1140 1141 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 1142 1143 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 1144 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 1145 1146 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 1147 1148 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); 1149 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); 1150} 1151 1152// Catch division cases where we can use shortcuts with rcp and rsq 1153// instructions. 
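// For example (assuming unsafe-fp-math, or f32 without denormal support):
//   fdiv 1.0, x         -> AMDGPUISD::RCP x
//   fdiv 1.0, (fsqrt x) -> AMDGPUISD::RSQ x
//   fdiv x, y           -> fmul x, (AMDGPUISD::RCP y)   (unsafe-fp-math only)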
1154SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { 1155 SDLoc SL(Op); 1156 SDValue LHS = Op.getOperand(0); 1157 SDValue RHS = Op.getOperand(1); 1158 EVT VT = Op.getValueType(); 1159 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; 1160 1161 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 1162 if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && 1163 CLHS->isExactlyValue(1.0)) { 1164 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 1165 // the CI documentation has a worst case error of 1 ulp. 1166 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 1167 // use it as long as we aren't trying to use denormals. 1168 1169 // 1.0 / sqrt(x) -> rsq(x) 1170 // 1171 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 1172 // error seems really high at 2^29 ULP. 1173 if (RHS.getOpcode() == ISD::FSQRT) 1174 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); 1175 1176 // 1.0 / x -> rcp(x) 1177 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 1178 } 1179 } 1180 1181 if (Unsafe) { 1182 // Turn into multiply by the reciprocal. 1183 // x / y -> x * (1.0 / y) 1184 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 1185 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); 1186 } 1187 1188 return SDValue(); 1189} 1190 1191SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 1192 SDValue FastLowered = LowerFastFDIV(Op, DAG); 1193 if (FastLowered.getNode()) 1194 return FastLowered; 1195 1196 // This uses v_rcp_f32 which does not handle denormals. Let this hit a 1197 // selection error for now rather than do something incorrect. 1198 if (Subtarget->hasFP32Denormals()) 1199 return SDValue(); 1200 1201 SDLoc SL(Op); 1202 SDValue LHS = Op.getOperand(0); 1203 SDValue RHS = Op.getOperand(1); 1204 1205 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); 1206 1207 const APFloat K0Val(BitsToFloat(0x6f800000)); 1208 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); 1209 1210 const APFloat K1Val(BitsToFloat(0x2f800000)); 1211 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); 1212 1213 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 1214 1215 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); 1216 1217 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 1218 1219 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); 1220 1221 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); 1222 1223 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); 1224 1225 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); 1226 1227 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); 1228} 1229 1230SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { 1231 if (DAG.getTarget().Options.UnsafeFPMath) 1232 return LowerFastFDIV(Op, DAG); 1233 1234 SDLoc SL(Op); 1235 SDValue X = Op.getOperand(0); 1236 SDValue Y = Op.getOperand(1); 1237 1238 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 1239 1240 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); 1241 1242 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); 1243 1244 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); 1245 1246 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); 1247 1248 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); 1249 1250 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); 1251 1252 
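  // Fma0/Fma1 above and Fma2/Fma3 below are two Newton-Raphson refinements of
  // the reciprocal r of the scaled denominator d: e = 1 - d * r, then
  // r' = r + r * e. Mul then forms the scaled quotient from DivScale1 (the
  // scaled numerator), and Fma4 computes the remainder that DIV_FMAS consumes.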
SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); 1253 1254 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); 1255 1256 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); 1257 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); 1258 1259 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, 1260 NegDivScale0, Mul, DivScale1); 1261 1262 SDValue Scale; 1263 1264 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { 1265 // Workaround a hardware bug on SI where the condition output from div_scale 1266 // is not usable. 1267 1268 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); 1269 1270 // Figure out if the scale to use for div_fmas. 1271 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 1272 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); 1273 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); 1274 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); 1275 1276 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); 1277 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); 1278 1279 SDValue Scale0Hi 1280 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); 1281 SDValue Scale1Hi 1282 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); 1283 1284 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); 1285 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); 1286 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); 1287 } else { 1288 Scale = DivScale1.getValue(1); 1289 } 1290 1291 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, 1292 Fma4, Fma3, Mul, Scale); 1293 1294 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); 1295} 1296 1297SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { 1298 EVT VT = Op.getValueType(); 1299 1300 if (VT == MVT::f32) 1301 return LowerFDIV32(Op, DAG); 1302 1303 if (VT == MVT::f64) 1304 return LowerFDIV64(Op, DAG); 1305 1306 llvm_unreachable("Unexpected type for fdiv"); 1307} 1308 1309SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1310 SDLoc DL(Op); 1311 StoreSDNode *Store = cast<StoreSDNode>(Op); 1312 EVT VT = Store->getMemoryVT(); 1313 1314 // These stores are legal. 
1315 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 1316 if (VT.isVector() && VT.getVectorNumElements() > 4) 1317 return ScalarizeVectorStore(Op, DAG); 1318 return SDValue(); 1319 } 1320 1321 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 1322 if (Ret.getNode()) 1323 return Ret; 1324 1325 if (VT.isVector() && VT.getVectorNumElements() >= 8) 1326 return ScalarizeVectorStore(Op, DAG); 1327 1328 if (VT == MVT::i1) 1329 return DAG.getTruncStore(Store->getChain(), DL, 1330 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 1331 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 1332 1333 return SDValue(); 1334} 1335 1336SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 1337 SDLoc DL(Op); 1338 EVT VT = Op.getValueType(); 1339 SDValue Arg = Op.getOperand(0); 1340 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 1341 DAG.getNode(ISD::FMUL, DL, VT, Arg, 1342 DAG.getConstantFP(0.5/M_PI, DL, 1343 VT))); 1344 1345 switch (Op.getOpcode()) { 1346 case ISD::FCOS: 1347 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); 1348 case ISD::FSIN: 1349 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); 1350 default: 1351 llvm_unreachable("Wrong trig opcode"); 1352 } 1353} 1354 1355//===----------------------------------------------------------------------===// 1356// Custom DAG optimizations 1357//===----------------------------------------------------------------------===// 1358 1359SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, 1360 DAGCombinerInfo &DCI) const { 1361 EVT VT = N->getValueType(0); 1362 EVT ScalarVT = VT.getScalarType(); 1363 if (ScalarVT != MVT::f32) 1364 return SDValue(); 1365 1366 SelectionDAG &DAG = DCI.DAG; 1367 SDLoc DL(N); 1368 1369 SDValue Src = N->getOperand(0); 1370 EVT SrcVT = Src.getValueType(); 1371 1372 // TODO: We could try to match extracting the higher bytes, which would be 1373 // easier if i8 vectors weren't promoted to i32 vectors, particularly after 1374 // types are legalized. v4i8 -> v4f32 is probably the only case to worry 1375 // about in practice. 1376 if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { 1377 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 1378 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); 1379 DCI.AddToWorklist(Cvt.getNode()); 1380 return Cvt; 1381 } 1382 } 1383 1384 // We are primarily trying to catch operations on illegal vector types 1385 // before they are expanded. 1386 // For scalars, we can use the more flexible method of checking masked bits 1387 // after legalization. 1388 if (!DCI.isBeforeLegalize() || 1389 !SrcVT.isVector() || 1390 SrcVT.getVectorElementType() != MVT::i8) { 1391 return SDValue(); 1392 } 1393 1394 assert(DCI.isBeforeLegalize() && "Unexpected legal type"); 1395 1396 // Weird sized vectors are a pain to handle, but we know 3 is really the same 1397 // size as 4. 1398 unsigned NElts = SrcVT.getVectorNumElements(); 1399 if (!SrcVT.isSimple() && NElts != 3) 1400 return SDValue(); 1401 1402 // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to 1403 // prevent a mess from expanding to v4i32 and repacking. 
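  // In effect, the v4i8 source of the conversion is loaded once as an i32
  // zextload and each byte is converted directly with CVT_F32_UBYTE0..3,
  // rather than being expanded to a v4i32 load and repacked.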
1404 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { 1405 EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); 1406 EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); 1407 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); 1408 LoadSDNode *Load = cast<LoadSDNode>(Src); 1409 1410 unsigned AS = Load->getAddressSpace(); 1411 unsigned Align = Load->getAlignment(); 1412 Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); 1413 unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); 1414 1415 // Don't try to replace the load if we have to expand it due to alignment 1416 // problems. Otherwise we will end up scalarizing the load, and trying to 1417 // repack into the vector for no real reason. 1418 if (Align < ABIAlignment && 1419 !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { 1420 return SDValue(); 1421 } 1422 1423 SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, 1424 Load->getChain(), 1425 Load->getBasePtr(), 1426 LoadVT, 1427 Load->getMemOperand()); 1428 1429 // Make sure successors of the original load stay after it by updating 1430 // them to use the new Chain. 1431 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); 1432 1433 SmallVector<SDValue, 4> Elts; 1434 if (RegVT.isVector()) 1435 DAG.ExtractVectorElements(NewLoad, Elts); 1436 else 1437 Elts.push_back(NewLoad); 1438 1439 SmallVector<SDValue, 4> Ops; 1440 1441 unsigned EltIdx = 0; 1442 for (SDValue Elt : Elts) { 1443 unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); 1444 for (unsigned I = 0; I < ComponentsInElt; ++I) { 1445 unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; 1446 SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); 1447 DCI.AddToWorklist(Cvt.getNode()); 1448 Ops.push_back(Cvt); 1449 } 1450 1451 ++EltIdx; 1452 } 1453 1454 assert(Ops.size() == NElts); 1455 1456 return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); 1457 } 1458 1459 return SDValue(); 1460} 1461 1462/// \brief Return true if the given offset Size in bytes can be folded into 1463/// the immediate offsets of a memory instruction for the given address space. 1464static bool canFoldOffset(unsigned OffsetSize, unsigned AS, 1465 const AMDGPUSubtarget &STI) { 1466 switch (AS) { 1467 case AMDGPUAS::GLOBAL_ADDRESS: { 1468 // MUBUF instructions a 12-bit offset in bytes. 1469 return isUInt<12>(OffsetSize); 1470 } 1471 case AMDGPUAS::CONSTANT_ADDRESS: { 1472 // SMRD instructions have an 8-bit offset in dwords on SI and 1473 // a 20-bit offset in bytes on VI. 1474 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 1475 return isUInt<20>(OffsetSize); 1476 else 1477 return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); 1478 } 1479 case AMDGPUAS::LOCAL_ADDRESS: 1480 case AMDGPUAS::REGION_ADDRESS: { 1481 // The single offset versions have a 16-bit offset in bytes. 1482 return isUInt<16>(OffsetSize); 1483 } 1484 case AMDGPUAS::PRIVATE_ADDRESS: 1485 // Indirect register addressing does not use any offsets. 1486 default: 1487 return 0; 1488 } 1489} 1490 1491// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 1492 1493// This is a variant of 1494// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 1495// 1496// The normal DAG combiner will do this, but only if the add has one use since 1497// that would increase the number of instructions. 1498// 1499// This prevents us from seeing a constant offset that can be folded into a 1500// memory instruction's addressing mode. 
If we know the resulting add offset of 1501// a pointer can be folded into an addressing offset, we can replace the pointer 1502// operand with the add of new constant offset. This eliminates one of the uses, 1503// and may allow the remaining use to also be simplified. 1504// 1505SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, 1506 unsigned AddrSpace, 1507 DAGCombinerInfo &DCI) const { 1508 SDValue N0 = N->getOperand(0); 1509 SDValue N1 = N->getOperand(1); 1510 1511 if (N0.getOpcode() != ISD::ADD) 1512 return SDValue(); 1513 1514 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 1515 if (!CN1) 1516 return SDValue(); 1517 1518 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 1519 if (!CAdd) 1520 return SDValue(); 1521 1522 // If the resulting offset is too large, we can't fold it into the addressing 1523 // mode offset. 1524 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 1525 if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) 1526 return SDValue(); 1527 1528 SelectionDAG &DAG = DCI.DAG; 1529 SDLoc SL(N); 1530 EVT VT = N->getValueType(0); 1531 1532 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 1533 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); 1534 1535 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); 1536} 1537 1538SDValue SITargetLowering::performAndCombine(SDNode *N, 1539 DAGCombinerInfo &DCI) const { 1540 if (DCI.isBeforeLegalize()) 1541 return SDValue(); 1542 1543 SelectionDAG &DAG = DCI.DAG; 1544 1545 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 1546 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 1547 SDValue LHS = N->getOperand(0); 1548 SDValue RHS = N->getOperand(1); 1549 1550 if (LHS.getOpcode() == ISD::SETCC && 1551 RHS.getOpcode() == ISD::SETCC) { 1552 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 1553 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 1554 1555 SDValue X = LHS.getOperand(0); 1556 SDValue Y = RHS.getOperand(0); 1557 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) 1558 return SDValue(); 1559 1560 if (LCC == ISD::SETO) { 1561 if (X != LHS.getOperand(1)) 1562 return SDValue(); 1563 1564 if (RCC == ISD::SETUNE) { 1565 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 1566 if (!C1 || !C1->isInfinity() || C1->isNegative()) 1567 return SDValue(); 1568 1569 const uint32_t Mask = SIInstrFlags::N_NORMAL | 1570 SIInstrFlags::N_SUBNORMAL | 1571 SIInstrFlags::N_ZERO | 1572 SIInstrFlags::P_ZERO | 1573 SIInstrFlags::P_SUBNORMAL | 1574 SIInstrFlags::P_NORMAL; 1575 1576 static_assert(((~(SIInstrFlags::S_NAN | 1577 SIInstrFlags::Q_NAN | 1578 SIInstrFlags::N_INFINITY | 1579 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, 1580 "mask not equal"); 1581 1582 SDLoc DL(N); 1583 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 1584 X, DAG.getConstant(Mask, DL, MVT::i32)); 1585 } 1586 } 1587 } 1588 1589 return SDValue(); 1590} 1591 1592SDValue SITargetLowering::performOrCombine(SDNode *N, 1593 DAGCombinerInfo &DCI) const { 1594 SelectionDAG &DAG = DCI.DAG; 1595 SDValue LHS = N->getOperand(0); 1596 SDValue RHS = N->getOperand(1); 1597 1598 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 1599 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 1600 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 1601 SDValue Src = LHS.getOperand(0); 1602 if (Src != RHS.getOperand(0)) 1603 return SDValue(); 1604 1605 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 1606 const 
ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 1607 if (!CLHS || !CRHS) 1608 return SDValue(); 1609 1610 // Only 10 bits are used. 1611 static const uint32_t MaxMask = 0x3ff; 1612 1613 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 1614 SDLoc DL(N); 1615 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 1616 Src, DAG.getConstant(NewMask, DL, MVT::i32)); 1617 } 1618 1619 return SDValue(); 1620} 1621 1622SDValue SITargetLowering::performClassCombine(SDNode *N, 1623 DAGCombinerInfo &DCI) const { 1624 SelectionDAG &DAG = DCI.DAG; 1625 SDValue Mask = N->getOperand(1); 1626 1627 // fp_class x, 0 -> false 1628 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { 1629 if (CMask->isNullValue()) 1630 return DAG.getConstant(0, SDLoc(N), MVT::i1); 1631 } 1632 1633 return SDValue(); 1634} 1635 1636static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 1637 switch (Opc) { 1638 case ISD::FMAXNUM: 1639 return AMDGPUISD::FMAX3; 1640 case ISD::SMAX: 1641 return AMDGPUISD::SMAX3; 1642 case ISD::UMAX: 1643 return AMDGPUISD::UMAX3; 1644 case ISD::FMINNUM: 1645 return AMDGPUISD::FMIN3; 1646 case ISD::SMIN: 1647 return AMDGPUISD::SMIN3; 1648 case ISD::UMIN: 1649 return AMDGPUISD::UMIN3; 1650 default: 1651 llvm_unreachable("Not a min/max opcode"); 1652 } 1653} 1654 1655SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, 1656 DAGCombinerInfo &DCI) const { 1657 SelectionDAG &DAG = DCI.DAG; 1658 1659 unsigned Opc = N->getOpcode(); 1660 SDValue Op0 = N->getOperand(0); 1661 SDValue Op1 = N->getOperand(1); 1662 1663 // Only do this if the inner op has one use since this will just increase 1664 // register pressure for no benefit. 1665 1666 // max(max(a, b), c) 1667 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { 1668 SDLoc DL(N); 1669 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 1670 DL, 1671 N->getValueType(0), 1672 Op0.getOperand(0), 1673 Op0.getOperand(1), 1674 Op1); 1675 } 1676 1677 // max(a, max(b, c)) 1678 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 1679 SDLoc DL(N); 1680 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 1681 DL, 1682 N->getValueType(0), 1683 Op0, 1684 Op1.getOperand(0), 1685 Op1.getOperand(1)); 1686 } 1687 1688 return SDValue(); 1689} 1690 1691SDValue SITargetLowering::performSetCCCombine(SDNode *N, 1692 DAGCombinerInfo &DCI) const { 1693 SelectionDAG &DAG = DCI.DAG; 1694 SDLoc SL(N); 1695 1696 SDValue LHS = N->getOperand(0); 1697 SDValue RHS = N->getOperand(1); 1698 EVT VT = LHS.getValueType(); 1699 1700 if (VT != MVT::f32 && VT != MVT::f64) 1701 return SDValue(); 1702 1703 // Match isinf pattern 1704 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 1705 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 1706 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { 1707 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 1708 if (!CRHS) 1709 return SDValue(); 1710 1711 const APFloat &APF = CRHS->getValueAPF(); 1712 if (APF.isInfinity() && !APF.isNegative()) { 1713 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 1714 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 1715 DAG.getConstant(Mask, SL, MVT::i32)); 1716 } 1717 } 1718 1719 return SDValue(); 1720} 1721 1722SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 1723 DAGCombinerInfo &DCI) const { 1724 SelectionDAG &DAG = DCI.DAG; 1725 SDLoc DL(N); 1726 1727 switch (N->getOpcode()) { 1728 default: 1729 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 1730 case
ISD::SETCC: 1731 return performSetCCCombine(N, DCI); 1732 case ISD::FMAXNUM: // TODO: What about fmax_legacy? 1733 case ISD::FMINNUM: 1734 case ISD::SMAX: 1735 case ISD::SMIN: 1736 case ISD::UMAX: 1737 case ISD::UMIN: { 1738 if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && 1739 N->getValueType(0) != MVT::f64 && 1740 getTargetMachine().getOptLevel() > CodeGenOpt::None) 1741 return performMin3Max3Combine(N, DCI); 1742 break; 1743 } 1744 1745 case AMDGPUISD::CVT_F32_UBYTE0: 1746 case AMDGPUISD::CVT_F32_UBYTE1: 1747 case AMDGPUISD::CVT_F32_UBYTE2: 1748 case AMDGPUISD::CVT_F32_UBYTE3: { 1749 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 1750 1751 SDValue Src = N->getOperand(0); 1752 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 1753 1754 APInt KnownZero, KnownOne; 1755 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 1756 !DCI.isBeforeLegalizeOps()); 1757 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 1758 if (TLO.ShrinkDemandedConstant(Src, Demanded) || 1759 TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { 1760 DCI.CommitTargetLoweringOpt(TLO); 1761 } 1762 1763 break; 1764 } 1765 1766 case ISD::UINT_TO_FP: 1767 return performUCharToFloatCombine(N, DCI); 1768 1769 case ISD::FADD: { 1770 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 1771 break; 1772 1773 EVT VT = N->getValueType(0); 1774 if (VT != MVT::f32) 1775 break; 1776 1777 // Only do this if we are not trying to support denormals. v_mad_f32 does 1778 // not support denormals ever. 1779 if (Subtarget->hasFP32Denormals()) 1780 break; 1781 1782 SDValue LHS = N->getOperand(0); 1783 SDValue RHS = N->getOperand(1); 1784 1785 // These should really be instruction patterns, but writing patterns with 1786 // source modifiers is a pain. 1787 1788 // fadd (fadd (a, a), b) -> mad 2.0, a, b 1789 if (LHS.getOpcode() == ISD::FADD) { 1790 SDValue A = LHS.getOperand(0); 1791 if (A == LHS.getOperand(1)) { 1792 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 1793 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); 1794 } 1795 } 1796 1797 // fadd (b, fadd (a, a)) -> mad 2.0, a, b 1798 if (RHS.getOpcode() == ISD::FADD) { 1799 SDValue A = RHS.getOperand(0); 1800 if (A == RHS.getOperand(1)) { 1801 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 1802 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); 1803 } 1804 } 1805 1806 return SDValue(); 1807 } 1808 case ISD::FSUB: { 1809 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 1810 break; 1811 1812 EVT VT = N->getValueType(0); 1813 1814 // Try to get the fneg to fold into the source modifier. This undoes generic 1815 // DAG combines and folds them into the mad. 1816 // 1817 // Only do this if we are not trying to support denormals. v_mad_f32 does 1818 // not support denormals ever.
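// The two patterns below mirror the fadd combines above; the subtraction is
// absorbed either as a negated third mad operand or as a -2.0 constant.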
1819 if (VT == MVT::f32 && 1820 !Subtarget->hasFP32Denormals()) { 1821 SDValue LHS = N->getOperand(0); 1822 SDValue RHS = N->getOperand(1); 1823 if (LHS.getOpcode() == ISD::FADD) { 1824 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 1825 1826 SDValue A = LHS.getOperand(0); 1827 if (A == LHS.getOperand(1)) { 1828 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 1829 SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); 1830 1831 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); 1832 } 1833 } 1834 1835 if (RHS.getOpcode() == ISD::FADD) { 1836 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 1837 1838 SDValue A = RHS.getOperand(0); 1839 if (A == RHS.getOperand(1)) { 1840 const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); 1841 return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); 1842 } 1843 } 1844 1845 return SDValue(); 1846 } 1847 1848 break; 1849 } 1850 1851 case ISD::LOAD: 1852 case ISD::STORE: 1853 case ISD::ATOMIC_LOAD: 1854 case ISD::ATOMIC_STORE: 1855 case ISD::ATOMIC_CMP_SWAP: 1856 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 1857 case ISD::ATOMIC_SWAP: 1858 case ISD::ATOMIC_LOAD_ADD: 1859 case ISD::ATOMIC_LOAD_SUB: 1860 case ISD::ATOMIC_LOAD_AND: 1861 case ISD::ATOMIC_LOAD_OR: 1862 case ISD::ATOMIC_LOAD_XOR: 1863 case ISD::ATOMIC_LOAD_NAND: 1864 case ISD::ATOMIC_LOAD_MIN: 1865 case ISD::ATOMIC_LOAD_MAX: 1866 case ISD::ATOMIC_LOAD_UMIN: 1867 case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. 1868 if (DCI.isBeforeLegalize()) 1869 break; 1870 1871 MemSDNode *MemNode = cast<MemSDNode>(N); 1872 SDValue Ptr = MemNode->getBasePtr(); 1873 1874 // TODO: We could also do this for multiplies. 1875 unsigned AS = MemNode->getAddressSpace(); 1876 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { 1877 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); 1878 if (NewPtr) { 1879 SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); 1880 1881 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; 1882 return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); 1883 } 1884 } 1885 break; 1886 } 1887 case ISD::AND: 1888 return performAndCombine(N, DCI); 1889 case ISD::OR: 1890 return performOrCombine(N, DCI); 1891 case AMDGPUISD::FP_CLASS: 1892 return performClassCombine(N, DCI); 1893 } 1894 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 1895} 1896 1897/// \brief Analyze the possible immediate value N 1898/// 1899/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate 1900/// and the immediate value if it's a literal immediate 1901int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { 1902 1903 const SIInstrInfo *TII = 1904 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 1905 1906 if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { 1907 if (TII->isInlineConstant(Node->getAPIntValue())) 1908 return 0; 1909 1910 uint64_t Val = Node->getZExtValue(); 1911 return isUInt<32>(Val) ?
Val : -1; 1912 } 1913 1914 if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { 1915 if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) 1916 return 0; 1917 1918 if (Node->getValueType(0) == MVT::f32) 1919 return FloatToBits(Node->getValueAPF().convertToFloat()); 1920 1921 return -1; 1922 } 1923 1924 return -1; 1925} 1926 1927/// \brief Helper function for adjustWritemask 1928static unsigned SubIdx2Lane(unsigned Idx) { 1929 switch (Idx) { 1930 default: return 0; 1931 case AMDGPU::sub0: return 0; 1932 case AMDGPU::sub1: return 1; 1933 case AMDGPU::sub2: return 2; 1934 case AMDGPU::sub3: return 3; 1935 } 1936} 1937 1938/// \brief Adjust the writemask of MIMG instructions 1939void SITargetLowering::adjustWritemask(MachineSDNode *&Node, 1940 SelectionDAG &DAG) const { 1941 SDNode *Users[4] = { }; 1942 unsigned Lane = 0; 1943 unsigned OldDmask = Node->getConstantOperandVal(0); 1944 unsigned NewDmask = 0; 1945 1946 // Try to figure out the used register components 1947 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 1948 I != E; ++I) { 1949 1950 // Abort if we can't understand the usage 1951 if (!I->isMachineOpcode() || 1952 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 1953 return; 1954 1955 // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. 1956 // Note that subregs are packed, i.e. Lane==0 is the first bit set 1957 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 1958 // set, etc. 1959 Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 1960 1961 // Set which texture component corresponds to the lane. 1962 unsigned Comp; 1963 for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { 1964 assert(Dmask); 1965 Comp = countTrailingZeros(Dmask); 1966 Dmask &= ~(1 << Comp); 1967 } 1968 1969 // Abort if we have more than one user per component 1970 if (Users[Lane]) 1971 return; 1972 1973 Users[Lane] = *I; 1974 NewDmask |= 1 << Comp; 1975 } 1976 1977 // Abort if there's no change 1978 if (NewDmask == OldDmask) 1979 return; 1980 1981 // Adjust the writemask in the node 1982 std::vector<SDValue> Ops; 1983 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); 1984 Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); 1985 Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); 1986 1987 // If we only got one lane, replace it with a copy 1988 // (if NewDmask has only one bit set...) 1989 if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { 1990 SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), 1991 MVT::i32); 1992 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 1993 SDLoc(), Users[Lane]->getValueType(0), 1994 SDValue(Node, 0), RC); 1995 DAG.ReplaceAllUsesWith(Users[Lane], Copy); 1996 return; 1997 } 1998 1999 // Update the users of the node with the new indices 2000 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { 2001 2002 SDNode *User = Users[i]; 2003 if (!User) 2004 continue; 2005 2006 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); 2007 DAG.UpdateNodeOperands(User, User->getOperand(0), Op); 2008 2009 switch (Idx) { 2010 default: break; 2011 case AMDGPU::sub0: Idx = AMDGPU::sub1; break; 2012 case AMDGPU::sub1: Idx = AMDGPU::sub2; break; 2013 case AMDGPU::sub2: Idx = AMDGPU::sub3; break; 2014 } 2015 } 2016} 2017 2018/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) 2019/// with frame index operands. 2020/// LLVM assumes that inputs to these instructions are registers.
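/// For example, an INSERT_SUBREG or REG_SEQUENCE produced during selection may
/// still carry a raw frame index operand; the loop below materializes such
/// operands with S_MOV_B32 so the instruction only sees register inputs.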
2021void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, 2022 SelectionDAG &DAG) const { 2023 2024 SmallVector<SDValue, 8> Ops; 2025 for (unsigned i = 0; i < Node->getNumOperands(); ++i) { 2026 if (!isa<FrameIndexSDNode>(Node->getOperand(i))) { 2027 Ops.push_back(Node->getOperand(i)); 2028 continue; 2029 } 2030 2031 SDLoc DL(Node); 2032 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, 2033 Node->getOperand(i).getValueType(), 2034 Node->getOperand(i)), 0)); 2035 } 2036 2037 DAG.UpdateNodeOperands(Node, Ops); 2038} 2039 2040/// \brief Fold the instructions after selecting them. 2041SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 2042 SelectionDAG &DAG) const { 2043 const SIInstrInfo *TII = 2044 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2045 2046 if (TII->isMIMG(Node->getMachineOpcode())) 2047 adjustWritemask(Node, DAG); 2048 2049 if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || 2050 Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { 2051 legalizeTargetIndependentNode(Node, DAG); 2052 return Node; 2053 } 2054 return Node; 2055} 2056 2057/// \brief Assign the register class depending on the number of 2058/// bits set in the writemask 2059void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, 2060 SDNode *Node) const { 2061 const SIInstrInfo *TII = 2062 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2063 2064 MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 2065 TII->legalizeOperands(MI); 2066 2067 if (TII->isMIMG(MI->getOpcode())) { 2068 unsigned VReg = MI->getOperand(0).getReg(); 2069 unsigned Writemask = MI->getOperand(1).getImm(); 2070 unsigned BitsSet = 0; 2071 for (unsigned i = 0; i < 4; ++i) 2072 BitsSet += Writemask & (1 << i) ? 1 : 0; 2073 2074 const TargetRegisterClass *RC; 2075 switch (BitsSet) { 2076 default: return; 2077 case 1: RC = &AMDGPU::VGPR_32RegClass; break; 2078 case 2: RC = &AMDGPU::VReg_64RegClass; break; 2079 case 3: RC = &AMDGPU::VReg_96RegClass; break; 2080 } 2081 2082 unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); 2083 MI->setDesc(TII->get(NewOpcode)); 2084 MRI.setRegClass(VReg, RC); 2085 return; 2086 } 2087 2088 // Replace unused atomics with the no return version. 2089 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); 2090 if (NoRetAtomicOp != -1) { 2091 if (!Node->hasAnyUseOfValue(0)) { 2092 MI->setDesc(TII->get(NoRetAtomicOp)); 2093 MI->RemoveOperand(0); 2094 } 2095 2096 return; 2097 } 2098} 2099 2100static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { 2101 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); 2102 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); 2103} 2104 2105MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, 2106 SDLoc DL, 2107 SDValue Ptr) const { 2108 const SIInstrInfo *TII = 2109 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2110#if 1 2111 // XXX - Workaround for moveToVALU not handling different register class 2112 // inserts for REG_SEQUENCE. 2113 2114 // Build the half of the subregister with the constants. 
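// sub0 of this v2i32 half is 0 and sub1 is the high 32 bits of the default
// resource data format; the pair lands in dwords 2-3 (sub2_sub3) of the final
// 128-bit descriptor, while Ptr supplies dwords 0-1.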
2115 const SDValue Ops0[] = { 2116 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), 2117 buildSMovImm32(DAG, DL, 0), 2118 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 2119 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 2120 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) 2121 }; 2122 2123 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, 2124 MVT::v2i32, Ops0), 0); 2125 2126 // Combine the constants and the pointer. 2127 const SDValue Ops1[] = { 2128 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 2129 Ptr, 2130 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), 2131 SubRegHi, 2132 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) 2133 }; 2134 2135 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); 2136#else 2137 const SDValue Ops[] = { 2138 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 2139 Ptr, 2140 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), 2141 buildSMovImm32(DAG, DL, 0), 2142 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 2143 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 2144 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) 2145 }; 2146 2147 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 2148 2149#endif 2150} 2151 2152/// \brief Return a resource descriptor with the 'Add TID' bit enabled. 2153/// The TID (Thread ID) is multiplied by the stride value (bits [61:48] 2154/// of the resource descriptor) to create an offset, which is added to the 2155/// resource pointer. 2156MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, 2157 SDLoc DL, 2158 SDValue Ptr, 2159 uint32_t RsrcDword1, 2160 uint64_t RsrcDword2And3) const { 2161 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); 2162 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); 2163 if (RsrcDword1) { 2164 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, 2165 DAG.getConstant(RsrcDword1, DL, MVT::i32)), 2166 0); 2167 } 2168 2169 SDValue DataLo = buildSMovImm32(DAG, DL, 2170 RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); 2171 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); 2172 2173 const SDValue Ops[] = { 2174 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 2175 PtrLo, 2176 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 2177 PtrHi, 2178 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), 2179 DataLo, 2180 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 2181 DataHi, 2182 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) 2183 }; 2184 2185 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 2186} 2187 2188MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, 2189 SDLoc DL, 2190 SDValue Ptr) const { 2191 const SIInstrInfo *TII = 2192 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2193 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | 2194 0xffffffff; // Size 2195 2196 return buildRSRC(DAG, DL, Ptr, 0, Rsrc); 2197} 2198 2199SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 2200 const TargetRegisterClass *RC, 2201 unsigned Reg, EVT VT) const { 2202 SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); 2203 2204 return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), 2205 cast<RegisterSDNode>(VReg)->getReg(), VT); 2206} 2207 2208//===----------------------------------------------------------------------===// 2209// SI Inline Assembly
Support 2210//===----------------------------------------------------------------------===// 2211 2212std::pair<unsigned, const TargetRegisterClass *> 2213SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 2214 const std::string &Constraint, 2215 MVT VT) const { 2216 if (Constraint == "r") { 2217 switch(VT.SimpleTy) { 2218 default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); 2219 case MVT::i64: 2220 return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); 2221 case MVT::i32: 2222 return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); 2223 } 2224 } 2225 2226 if (Constraint.size() > 1) { 2227 const TargetRegisterClass *RC = nullptr; 2228 if (Constraint[1] == 'v') { 2229 RC = &AMDGPU::VGPR_32RegClass; 2230 } else if (Constraint[1] == 's') { 2231 RC = &AMDGPU::SGPR_32RegClass; 2232 } 2233 2234 if (RC) { 2235 unsigned Idx = std::atoi(Constraint.substr(2).c_str()); 2236 if (Idx < RC->getNumRegs()) 2237 return std::make_pair(RC->getRegister(Idx), RC); 2238 } 2239 } 2240 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 2241} 2242
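// ---------------------------------------------------------------------------
// Illustrative sketch, kept disabled under #if 0: the 'r' constraint above
// picks an SGPR class by type, while explicit physical register constraints
// arrive in braced form (e.g. "{v5}" or "{s17}") and are decoded from
// Constraint[1] plus std::atoi on Constraint.substr(2). The standalone helper
// below mirrors only that string parsing; parseSIRegisterConstraint is a
// hypothetical name, not an LLVM API.
#if 0
#include <cstdlib>
#include <string>
#include <utility>

// Returns {'v' or 's', register index} for a braced constraint such as
// "{v5}", or {0, 0} if the string is not in that form.
static std::pair<char, unsigned>
parseSIRegisterConstraint(const std::string &Constraint) {
  if (Constraint.size() < 4 || Constraint.front() != '{' ||
      Constraint.back() != '}')
    return {0, 0};

  char File = Constraint[1];
  if (File != 'v' && File != 's')
    return {0, 0};

  // std::atoi stops at the trailing '}', so "{v5}" yields 5, matching the
  // substr(2) parse in getRegForInlineAsmConstraint above.
  unsigned Idx = std::atoi(Constraint.substr(2).c_str());
  return {File, Idx};
}

// e.g. parseSIRegisterConstraint("{v5}")  yields {'v', 5}
//      parseSIRegisterConstraint("{s17}") yields {'s', 17}
#endif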