//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#ifdef _MSC_VER
// Provide M_PI.
#define _USE_MATH_DEFINES
#include <cmath>
#endif

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv.
static cl::opt<bool> EnableAMDGPUFastFDIV(
  "amdgpu-fast-fdiv",
  cl::desc("Enable faster 2.5 ulp fdiv"),
  cl::init(false));

static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const SISubtarget &STI)
    : AMDGPUTargetLowering(TM, STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties(STI.getRegisterInfo());

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (getSubtarget()->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // This is s_memtime on SI and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);

  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);

  if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);

  setSchedulingPreference(Sched::RegPressure);
}

const SISubtarget *SITargetLowering::getSubtarget() const {
  return static_cast<const SISubtarget *>(Subtarget);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          unsigned IntrID) const {
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.vol = false;
    Info.readMem = true;
    Info.writeMem = true;
    return true;
  default:
    return false;
  }
}

bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
                                          EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  // Flat instructions do not have offsets, and only have the register
  // address.
  return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
}

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options. Depending on the resource constant, it can also do
  // (i64 r0) + (i32 r1) * (i14 i).
  //
  // Private arrays end up using a scratch buffer most of the time, so also
  // assume those use MUBUF instructions. Scratch loads / stores are currently
  // implemented as mubuf instructions with offen bit set, so slightly
  // different than the normal addr64.
  if (!isUInt<12>(AM.BaseOffs))
    return false;

  // FIXME: Since we can split immediate into soffset and immediate offset,
  // would it make sense to allow any immediate?

  switch (AM.Scale) {
  case 0: // r + i or just i, depending on HasBaseReg.
    return true;
  case 1:
    return true; // We have r + r or r + i.
  case 2:
    if (AM.HasBaseReg) {
      // Reject 2 * r + r.
      return false;
    }

    // Allow 2 * r as r + r
    // Or  2 * r + i is allowed as r + r + i.
    return true;
  default: // Don't allow n * r
    return false;
  }
}
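
// A few illustrative AddrMode cases for the MUBUF check above (a sketch of
// what the switch accepts, restating the logic rather than adding to it):
//   Scale = 0, BaseOffs = 16           -> r + 16 (or just 16): legal.
//   Scale = 1, BaseOffs = 16           -> r + r + 16 via addr64: legal.
//   Scale = 2, no base register        -> 2 * r, folded to r + r: legal.
//   Scale = 2 with a base register     -> 2 * r + r: rejected.
//   Any BaseOffs needing > 12 bits     -> rejected before the switch.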

bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS: {
    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
      // Assume that we will use FLAT for all global memory accesses
      // on VI.
      // FIXME: This assumption is currently wrong. On VI we still use
      // MUBUF instructions for the r + i addressing mode. As currently
      // implemented, the MUBUF instructions only work on buffer < 4GB.
      // It may be possible to support > 4GB buffers with MUBUF instructions,
      // by setting the stride value in the resource descriptor which would
      // increase the size limit to (stride * 4GB). However, this is risky,
      // because it has never been validated.
      return isLegalFlatAddressingMode(AM);
    }

    return isLegalMUBUFAddressingMode(AM);
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    // There are no SMRD extloads, so if we have to do a small type access we
    // will use a MUBUF load.
    // FIXME?: We also need to do this if unaligned, but we don't know the
    // alignment here.
    if (DL.getTypeStoreSize(Ty) < 4)
      return isLegalMUBUFAddressingMode(AM);

    if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is a 20-bit byte offset.
      if (!isUInt<20>(AM.BaseOffs))
        return false;
    } else
      llvm_unreachable("unhandled generation");

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }

  case AMDGPUAS::PRIVATE_ADDRESS:
    return isLegalMUBUFAddressingMode(AM);

  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS: {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // field.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }
  case AMDGPUAS::FLAT_ADDRESS:
  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM);

  default:
    llvm_unreachable("unhandled address space");
  }
}

bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  if (!VT.isSimple() || VT == MVT::Other)
    return false;

  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    bool AlignedBy4 = (Align % 4 == 0);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  if (Subtarget->hasUnalignedBufferAccess()) {
    // If we have a uniform constant load, it still requires using a slow
    // buffer instruction if unaligned.
    if (IsFast) {
      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
        (Align % 4 == 0) : true;
    }

    return true;
  }

  // Values smaller than a dword must be aligned.
  if (VT.bitsLT(MVT::i32))
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}

EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                          unsigned SrcAlign, bool IsMemset,
                                          bool ZeroMemset,
                                          bool MemcpyStrSrc,
                                          MachineFunction &MF) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.

  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
    return MVT::v4i32;

  if (Size >= 8 && DstAlign >= 4)
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}

static bool isFlatGlobalAddrSpace(unsigned AS) {
  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
         AS == AMDGPUAS::FLAT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS;
}

bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}

bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  const Value *Ptr = MemNode->getMemOperand()->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  return TII->isInlineConstant(Imm);
}

bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
  // SimplifySetCC uses this function to determine whether or not it should
  // create setcc with i1 operands. We don't have instructions for i1 setcc.
  if (VT == MVT::i1 && Op == ISD::SETCC)
    return false;

  return TargetLowering::isTypeDesirableForOp(Op, VT);
}

SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
                                            const SDLoc &SL, SDValue Chain,
                                            unsigned Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
                                       MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
  return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                     DAG.getConstant(Offset, SL, PtrVT));
}
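
// A sketch of what the helper below produces, for example for a 4-byte integer
// kernel argument at byte offset 36 of the kernarg segment: an invariant,
// extending load from KERNARG_SEGMENT_PTR + 36 in the constant address space,
// sign- or zero-extended according to 'Signed' (or an any-extending load for
// floating-point memory types).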

SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Chain,
                                         unsigned Offset, bool Signed) const {
  const DataLayout &DL = DAG.getDataLayout();
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue PtrOffset = DAG.getUNDEF(PtrVT);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  unsigned Align = DL.getABITypeAlignment(Ty);

  ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  if (MemVT.isFloatingPoint())
    ExtTy = ISD::EXTLOAD;

  SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
  return DAG.getLoad(ISD::UNINDEXED, ExtTy,
                     VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
                     false, // isVolatile
                     true, // isNonTemporal
                     true, // isInvariant
                     Align); // Alignment
}

SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    const Function *Fn = MF.getFunction();
    DiagnosticInfoUnsupported NoGraphicsHSA(
        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();
  }

  // Create stack objects that are used for emitting debugger prologue if
  // "amdgpu-debugger-emit-prologue" attribute was specified.
  if (ST.debuggerEmitPrologue())
    createDebuggerPrologueStackObjects(MF);

  SmallVector<ISD::InputArg, 16> Splits;
  BitVector Skipped(Ins.size());

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First, check if it's a PS input addr.
    if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
        !Arg.Flags.isByVal() && PSInputNum <= 15) {

      if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
        // We can safely skip PS inputs.
        Skipped.set(i);
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (Arg.Used)
        Info->PSInputEna |= 1 << PSInputNum;

      ++PSInputNum;
    }

    if (AMDGPU::isShader(CallConv)) {
      // Second, split vertices into their elements.
      if (Arg.VT.isVector()) {
        ISD::InputArg NewArg = Arg;
        NewArg.Flags.setSplit();
        NewArg.VT = Arg.VT.getVectorElementType();

        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
        // three or five element vertex only needs three or five registers,
        // NOT four or eight.
        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
        unsigned NumElements = ParamType->getVectorNumElements();

        for (unsigned j = 0; j != NumElements; ++j) {
          Splits.push_back(NewArg);
          NewArg.PartOffset += NewArg.VT.getStoreSize();
        }
      } else {
        Splits.push_back(Arg);
      }
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  //
  // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
  // PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  //   - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  //   - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //     enabled too.
  if (CallConv == CallingConv::AMDGPU_PS &&
      ((Info->getPSInputAddr() & 0x7F) == 0 ||
       ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
    Info->markPSInputAllocated(0);
    Info->PSInputEna |= 1;
  }

  if (!AMDGPU::isShader(CallConv)) {
    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                            Splits);

    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info->hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info->hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info->hasQueuePtr()) {
    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info->hasKernargSegmentPtr()) {
    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info->hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  AnalyzeFormalArguments(CCInfo, Splits);

  SmallVector<SDValue, 16> Chains;

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];
    if (Skipped[i]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = Splits[i].VT;
      const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
                              VA.getLocMemOffset();
      // The first 36 bytes of the input buffer contain information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
                                   Offset, Ins[i].Flags.isSExt());
      Chains.push_back(Arg.getValue(1));

      auto *ParamTy =
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16 bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
                          DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Arg);
      Info->ABIArgOffset = Offset + MemVT.getStoreSize();
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer.
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Copy);
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {
      // Build a vector from the registers.
      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);

        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
        Regs.push_back(Copy);
      }

      // Fill up the missing vector elements.
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      Regs.append(NumElements, DAG.getUNDEF(VT));

      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
      continue;
    }

    InVals.push_back(Val);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.

  // Start adding system SGPRs.
  if (Info->hasWorkGroupIDX()) {
    unsigned Reg = Info->addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDY()) {
    unsigned Reg = Info->addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDZ()) {
    unsigned Reg = Info->addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupInfo()) {
    unsigned Reg = Info->addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (AMDGPU::isShader(CallConv)) {
      PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
    } else
      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }

  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info->setHasNonSpillStackObjects(true);
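
  // Summary of the scratch setup decision below (restating the surrounding
  // logic, not adding new behavior):
  //   HSA + stack objects:   use the PRIVATE_SEGMENT_BUFFER /
  //                          PRIVATE_SEGMENT_WAVE_BYTE_OFFSET user SGPRs directly.
  //   HSA, no stack objects: tentatively pick reserved registers; they are
  //                          fixed up after register allocation.
  //   Non-HSA:               the rsrc register is always a reserved register;
  //                          the wave offset comes from the input SGPR only if
  //                          stack objects exist.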

  if (ST.isAmdHsaOS()) {
    // TODO: Assume we will spill without optimizations.
    if (HasStackObjects) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the HSA ABI, this will be the first 4 user SGPR
      // inputs. We can reserve those and use them directly.
      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
      Info->setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
    } else {
      unsigned ReservedBufferReg
        = TRI->reservedPrivateSegmentBufferReg(MF);
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);

      // We tentatively reserve the last registers (skipping the last two
      // which may contain VCC). After register allocation, we'll replace
      // these with the ones immediately after those which were really
      // allocated. In the prologue copies will be inserted from the argument
      // to these reserved registers.
      Info->setScratchRSrcReg(ReservedBufferReg);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  } else {
    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info->setScratchRSrcReg(ReservedBufferReg);

    if (HasStackObjects) {
      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    } else {
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  }

  if (Info->hasWorkItemIDX()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDY()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDZ()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Chains.empty())
    return Chain;

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (!AMDGPU::isShader(CallConv))
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);

  Info->setIfReturnsVoid(Outs.size() == 0);

  SmallVector<ISD::OutputArg, 48> Splits;
  SmallVector<SDValue, 48> SplitVals;

  // Split vectors into their elements.
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    const ISD::OutputArg &Out = Outs[i];

    if (Out.VT.isVector()) {
      MVT VT = Out.VT.getVectorElementType();
      ISD::OutputArg NewOut = Out;
      NewOut.Flags.setSplit();
      NewOut.VT = VT;

      // We want the original number of vector elements here, e.g.
      // three or five, not four or eight.
      unsigned NumElements = Out.ArgVT.getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
                                   DAG.getConstant(j, DL, MVT::i32));
        SplitVals.push_back(Elem);
        Splits.push_back(NewOut);
        NewOut.PartOffset += NewOut.VT.getStoreSize();
      }
    } else {
      SplitVals.push_back(OutVals[i]);
      Splits.push_back(Out);
    }
  }

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  AnalyzeReturn(CCInfo, Splits);

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = SplitVals[realRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  unsigned Opc = Info->returnsVoid() ?
    AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}

unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(AMDGPU::NoRegister);

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName) + "\"."));
  }

  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName) + "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName) + "\"."));
}

// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
                                                    MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineBasicBlock::iterator SplitPoint(&MI);
  ++SplitPoint;

  if (SplitPoint == BB->end()) {
    // Don't bother with a new block.
    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
    return BB;
  }

  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *SplitBB
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());

  SmallSet<unsigned, 8> SplitDefRegs;
  for (auto I = SplitPoint, E = BB->end(); I != E; ++I) {
    for (MachineOperand &Def : I->defs())
      SplitDefRegs.insert(Def.getReg());
  }

  // Fix the block phi references to point to the new block for the defs in the
  // second piece of the block.
  for (MachineBasicBlock *Succ : BB->successors()) {
    for (MachineInstr &MI : *Succ) {
      if (!MI.isPHI())
        break;

      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        unsigned IncomingReg = MI.getOperand(I).getReg();
        MachineOperand &FromBB = MI.getOperand(I + 1);
        if (BB == FromBB.getMBB()) {
          if (SplitDefRegs.count(IncomingReg))
            FromBB.setMBB(SplitBB);

          break;
        }
      }
    }
  }

  MF->insert(++MachineFunction::iterator(BB), SplitBB);
  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());

  SplitBB->transferSuccessors(BB);
  BB->addSuccessor(SplitBB);

  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
  return SplitBB;
}

MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *BB) const {
  switch (MI.getOpcode()) {
  case AMDGPU::SI_INIT_M0: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addOperand(MI.getOperand(0));
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::BRANCH:
    return BB;
  case AMDGPU::GET_GROUPSTATICSIZE: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    MachineFunction *MF = BB->getParent();
    SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32))
      .addOperand(MI.getOperand(0))
      .addImm(MFI->LDSSize);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_KILL:
    return splitKillBlock(MI, BB);
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  }
  return BB;
}

bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma with
  // free fneg'd operands. As long as we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), we should perform these.

  // When fma is quarter rate, for f64 where add / sub are at best half rate,
  // most of these combines appear to be cycle neutral but save on instruction
  // count / code size.
  return true;
}

EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                         EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
  return MVT::i32;
}

// Answering this is somewhat tricky and depends on the specific device;
// different devices have different rates for fma or all f64 operations.
//
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
// regardless of which device (although the number of cycles differs between
// devices), so it is always profitable for f64.
//
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
// which we can always do even without fused FP ops since it returns the same
// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that it is not faster for f32.
// v_mad_f32, however, does not support denormals, so we do report fma as
// faster if we have a fast fma device and require denormals.
//
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
    // This is as fast on some subtargets. However, we always have full rate f32
    // mad available which returns the same result as the separate operations
    // which we should prefer over fma. We can't use this if we want to support
    // denormals, so only report this in these cases.
    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::FDIV: return LowerFDIV(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
  case ISD::TRAP: return lowerTRAP(Op, DAG);
  }
  return SDValue();
}

/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {
    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return nullptr;
}

SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
  unsigned FrameIndex = FINode->getIndex();

  // A FrameIndex node represents a 32-bit offset into scratch memory. If the
  // high bit of a frame index offset were to be set, this would mean that it
  // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
  // buffer, with 64 being the number of threads per wave.
  //
  // The maximum private allocation for the entire GPU is 4G, and we are
  // concerned with the largest the index could ever be for an individual
  // workitem. This will occur with the minimum dispatch size. If a program
  // requires more, the dispatch size will be reduced.
  //
  // With this limit, we can mark the high bit of the FrameIndex node as known
  // zero, which is important, because it means in most situations we can prove
  // that values derived from FrameIndex nodes are non-negative. This enables us
  // to take advantage of more addressing modes when accessing scratch buffers,
  // since for scratch reads/writes, the register offset must always be
  // positive.

  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;

  // XXX - It is unclear if partial dispatch works. Assume it works at half wave
  // granularity. It is probably a full wave.
  uint64_t MinGranularity = 32;

  // 4G / 32 = 2^27, so the frame index is asserted to fit in 27 bits.
  unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
  EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);

  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
                     DAG.getValueType(ExtVT));
}

bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;

  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
  default: return false;
  case AMDGPUIntrinsic::amdgcn_if:
  case AMDGPUIntrinsic::amdgcn_else:
  case AMDGPUIntrinsic::amdgcn_break:
  case AMDGPUIntrinsic::amdgcn_if_break:
  case AMDGPUIntrinsic::amdgcn_else_break:
  case AMDGPUIntrinsic::amdgcn_loop:
  case AMDGPUIntrinsic::amdgcn_end_cf:
    return true;
  }
}

void SITargetLowering::createDebuggerPrologueStackObjects(
    MachineFunction &MF) const {
  // Create stack objects that are used for emitting debugger prologue.
  //
  // Debugger prologue writes work group IDs and work item IDs to scratch memory
  // at fixed location in the following format:
  //   offset 0:  work group ID x
  //   offset 4:  work group ID y
  //   offset 8:  work group ID z
  //   offset 16: work item ID x
  //   offset 20: work item ID y
  //   offset 24: work item ID z
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  int ObjectIdx = 0;

  // For each dimension:
  for (unsigned i = 0; i < 3; ++i) {
    // Create fixed stack object for work group ID.
    ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true);
    Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
    // Create fixed stack object for work item ID.
    ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true);
    Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
  }
}

/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter. It also switches the branch target with BR if the
/// need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();
  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  if (!isCFIntrinsic(Intr)) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
              ISD::SETNE));

  // Build the result and
  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(BRCOND.getOperand(0));
  Ops.append(Intr->op_begin() + 1, Intr->op_end());
  Ops.push_back(Target);

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(
    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
    DAG.getVTList(Res), Ops).getNode();

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
    BR = NewBR.getNode();
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}
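
// The next two functions cooperate to build flat pointers from segment
// (LDS / scratch) pointers. Roughly, for a 32-bit segment offset 'p', the
// local/private -> flat case of lowerADDRSPACECAST produces (a sketch, with
// 'aperture_hi' naming the 32-bit value loaded below from the queue pointer):
//   p == -1 ? 0 : ((uint64_t)aperture_hi << 32) | (uint32_t)p
// where the combine is really a { p, aperture_hi } v2i32 build_vector
// bitcast to i64.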
SDValue SITargetLowering::getSegmentAperture(unsigned AS,
                                             SelectionDAG &DAG) const {
  SDLoc SL;
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
  assert(UserSGPR != AMDGPU::NoRegister);

  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
                            DAG.getConstant(StructOffset, SL, MVT::i64));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
                                              AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr,
                     PtrInfo, false,
                     false, true,
                     MinAlign(64, StructOffset));
}

SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);
  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);

  SDValue Src = ASC->getOperand(0);

  // FIXME: Really support non-0 null pointers.
  SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  // flat -> local/private
  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
    if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
        ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
                         NonNull, Ptr, SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
    if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
        ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue NonNull
        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
      SDValue CvtPtr
        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
                         FlatNullPtr);
    }
  }

  // global <-> flat are no-ops and never emitted.

  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
    *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
  DAG.getContext()->diagnose(InvalidAddrSpaceCast);

  return DAG.getUNDEF(ASC->getValueType(0));
}

static bool shouldEmitGOTReloc(const GlobalValue *GV,
                               const TargetMachine &TM) {
  return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
         !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
}

bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // We can fold offsets for anything that doesn't require a GOT relocation.
  return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
         !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine());
}

static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                                       SDLoc DL, unsigned Offset, EVT PtrVT,
                                       unsigned GAFlags = SIInstrInfo::MO_NONE) {
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
  // a fixup or relocation is emitted to replace $symbol with a literal
  // constant, which is a pc-relative offset from the encoding of the $symbol
  // operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.
  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
                                          GAFlags);
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA);
}

SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);

  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
      GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  SDLoc DL(GSD);
  const GlobalValue *GV = GSD->getGlobal();
  EVT PtrVT = Op.getValueType();

  if (!shouldEmitGOTReloc(GV, getTargetMachine()))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);

  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
                                            SIInstrInfo::MO_GOTPCREL);

  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  const DataLayout &DataLayout = DAG.getDataLayout();
  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr,
                     PtrInfo, false, false, true, Align);
}

SDValue SITargetLowering::lowerTRAP(SDValue Op,
                                    SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
                                   "trap handler not supported",
                                   Op.getDebugLoc(),
                                   DS_Warning);
  DAG.getContext()->diagnose(NoTrap);

  // Emit s_endpgm.

  // FIXME: This should really be selected to s_trap, but that requires
  // setting up the trap handler for it to do anything.
  return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
                     Op.getOperand(0));
}

SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
1611 // 1612 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 1613 // so we will end up with redundant moves to m0. 1614 // 1615 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. 1616 1617 // A Null SDValue creates a glue result. 1618 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, 1619 V, Chain); 1620 return SDValue(M0, 0); 1621} 1622 1623SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, 1624 SDValue Op, 1625 MVT VT, 1626 unsigned Offset) const { 1627 SDLoc SL(Op); 1628 SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, 1629 DAG.getEntryNode(), Offset, false); 1630 // The local size values will have the hi 16-bits as zero. 1631 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, 1632 DAG.getValueType(VT)); 1633} 1634 1635static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { 1636 DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), 1637 "non-hsa intrinsic with hsa target", 1638 DL.getDebugLoc()); 1639 DAG.getContext()->diagnose(BadIntrin); 1640 return DAG.getUNDEF(VT); 1641} 1642 1643static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { 1644 DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), 1645 "intrinsic not supported on subtarget", 1646 DL.getDebugLoc()); 1647 DAG.getContext()->diagnose(BadIntrin); 1648 return DAG.getUNDEF(VT); 1649} 1650 1651SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 1652 SelectionDAG &DAG) const { 1653 MachineFunction &MF = DAG.getMachineFunction(); 1654 auto MFI = MF.getInfo<SIMachineFunctionInfo>(); 1655 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 1656 1657 EVT VT = Op.getValueType(); 1658 SDLoc DL(Op); 1659 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 1660 1661 // TODO: Should this propagate fast-math-flags? 1662 1663 switch (IntrinsicID) { 1664 case Intrinsic::amdgcn_dispatch_ptr: 1665 case Intrinsic::amdgcn_queue_ptr: { 1666 if (!Subtarget->isAmdHsaOS()) { 1667 DiagnosticInfoUnsupported BadIntrin( 1668 *MF.getFunction(), "unsupported hsa intrinsic without hsa target", 1669 DL.getDebugLoc()); 1670 DAG.getContext()->diagnose(BadIntrin); 1671 return DAG.getUNDEF(VT); 1672 } 1673 1674 auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
1675 SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; 1676 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, 1677 TRI->getPreloadedValue(MF, Reg), VT); 1678 } 1679 case Intrinsic::amdgcn_implicitarg_ptr: { 1680 unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); 1681 return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset); 1682 } 1683 case Intrinsic::amdgcn_kernarg_segment_ptr: { 1684 unsigned Reg 1685 = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 1686 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); 1687 } 1688 case Intrinsic::amdgcn_rcp: 1689 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 1690 case Intrinsic::amdgcn_rsq: 1691 case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name 1692 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 1693 case Intrinsic::amdgcn_rsq_legacy: { 1694 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 1695 return emitRemovedIntrinsicError(DAG, DL, VT); 1696 1697 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 1698 } 1699 case Intrinsic::amdgcn_rsq_clamp: 1700 case AMDGPUIntrinsic::AMDGPU_rsq_clamped: { // Legacy name 1701 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) 1702 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 1703 1704 Type *Type = VT.getTypeForEVT(*DAG.getContext()); 1705 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); 1706 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); 1707 1708 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 1709 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, 1710 DAG.getConstantFP(Max, DL, VT)); 1711 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, 1712 DAG.getConstantFP(Min, DL, VT)); 1713 } 1714 case Intrinsic::r600_read_ngroups_x: 1715 if (Subtarget->isAmdHsaOS()) 1716 return emitNonHSAIntrinsicError(DAG, DL, VT); 1717 1718 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1719 SI::KernelInputOffsets::NGROUPS_X, false); 1720 case Intrinsic::r600_read_ngroups_y: 1721 if (Subtarget->isAmdHsaOS()) 1722 return emitNonHSAIntrinsicError(DAG, DL, VT); 1723 1724 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1725 SI::KernelInputOffsets::NGROUPS_Y, false); 1726 case Intrinsic::r600_read_ngroups_z: 1727 if (Subtarget->isAmdHsaOS()) 1728 return emitNonHSAIntrinsicError(DAG, DL, VT); 1729 1730 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1731 SI::KernelInputOffsets::NGROUPS_Z, false); 1732 case Intrinsic::r600_read_global_size_x: 1733 if (Subtarget->isAmdHsaOS()) 1734 return emitNonHSAIntrinsicError(DAG, DL, VT); 1735 1736 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1737 SI::KernelInputOffsets::GLOBAL_SIZE_X, false); 1738 case Intrinsic::r600_read_global_size_y: 1739 if (Subtarget->isAmdHsaOS()) 1740 return emitNonHSAIntrinsicError(DAG, DL, VT); 1741 1742 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1743 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); 1744 case Intrinsic::r600_read_global_size_z: 1745 if (Subtarget->isAmdHsaOS()) 1746 return emitNonHSAIntrinsicError(DAG, DL, VT); 1747 1748 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1749 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); 1750 case Intrinsic::r600_read_local_size_x: 1751 if (Subtarget->isAmdHsaOS()) 1752 return emitNonHSAIntrinsicError(DAG, DL, VT); 1753 1754 return lowerImplicitZextParam(DAG, Op, MVT::i16, 1755 SI::KernelInputOffsets::LOCAL_SIZE_X); 1756 case 
Intrinsic::r600_read_local_size_y: 1757 if (Subtarget->isAmdHsaOS()) 1758 return emitNonHSAIntrinsicError(DAG, DL, VT); 1759 1760 return lowerImplicitZextParam(DAG, Op, MVT::i16, 1761 SI::KernelInputOffsets::LOCAL_SIZE_Y); 1762 case Intrinsic::r600_read_local_size_z: 1763 if (Subtarget->isAmdHsaOS()) 1764 return emitNonHSAIntrinsicError(DAG, DL, VT); 1765 1766 return lowerImplicitZextParam(DAG, Op, MVT::i16, 1767 SI::KernelInputOffsets::LOCAL_SIZE_Z); 1768 case Intrinsic::amdgcn_read_workdim: 1769 case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name. 1770 // Really only 2 bits. 1771 return lowerImplicitZextParam(DAG, Op, MVT::i8, 1772 getImplicitParameterOffset(MFI, GRID_DIM)); 1773 case Intrinsic::amdgcn_workgroup_id_x: 1774 case Intrinsic::r600_read_tgid_x: 1775 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 1776 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); 1777 case Intrinsic::amdgcn_workgroup_id_y: 1778 case Intrinsic::r600_read_tgid_y: 1779 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 1780 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); 1781 case Intrinsic::amdgcn_workgroup_id_z: 1782 case Intrinsic::r600_read_tgid_z: 1783 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 1784 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); 1785 case Intrinsic::amdgcn_workitem_id_x: 1786 case Intrinsic::r600_read_tidig_x: 1787 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 1788 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); 1789 case Intrinsic::amdgcn_workitem_id_y: 1790 case Intrinsic::r600_read_tidig_y: 1791 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 1792 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); 1793 case Intrinsic::amdgcn_workitem_id_z: 1794 case Intrinsic::r600_read_tidig_z: 1795 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 1796 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); 1797 case AMDGPUIntrinsic::SI_load_const: { 1798 SDValue Ops[] = { 1799 Op.getOperand(1), 1800 Op.getOperand(2) 1801 }; 1802 1803 MachineMemOperand *MMO = MF.getMachineMemOperand( 1804 MachinePointerInfo(), 1805 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, 1806 VT.getStoreSize(), 4); 1807 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, 1808 Op->getVTList(), Ops, VT, MMO); 1809 } 1810 case AMDGPUIntrinsic::SI_vs_load_input: 1811 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, 1812 Op.getOperand(1), 1813 Op.getOperand(2), 1814 Op.getOperand(3)); 1815 1816 case AMDGPUIntrinsic::SI_fs_constant: { 1817 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 1818 SDValue Glue = M0.getValue(1); 1819 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, 1820 DAG.getConstant(2, DL, MVT::i32), // P0 1821 Op.getOperand(1), Op.getOperand(2), Glue); 1822 } 1823 case AMDGPUIntrinsic::SI_packf16: 1824 if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) 1825 return DAG.getUNDEF(MVT::i32); 1826 return Op; 1827 case AMDGPUIntrinsic::SI_fs_interp: { 1828 SDValue IJ = Op.getOperand(4); 1829 SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 1830 DAG.getConstant(0, DL, MVT::i32)); 1831 SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 1832 DAG.getConstant(1, DL, MVT::i32)); 1833 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 1834 SDValue Glue = M0.getValue(1); 1835 SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, 1836 DAG.getVTList(MVT::f32, 
MVT::Glue), 1837 I, Op.getOperand(1), Op.getOperand(2), Glue); 1838 Glue = SDValue(P1.getNode(), 1); 1839 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, 1840 Op.getOperand(1), Op.getOperand(2), Glue); 1841 } 1842 case Intrinsic::amdgcn_interp_p1: { 1843 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); 1844 SDValue Glue = M0.getValue(1); 1845 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), 1846 Op.getOperand(2), Op.getOperand(3), Glue); 1847 } 1848 case Intrinsic::amdgcn_interp_p2: { 1849 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); 1850 SDValue Glue = SDValue(M0.getNode(), 1); 1851 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), 1852 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), 1853 Glue); 1854 } 1855 case Intrinsic::amdgcn_sin: 1856 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); 1857 1858 case Intrinsic::amdgcn_cos: 1859 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); 1860 1861 case Intrinsic::amdgcn_log_clamp: { 1862 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) 1863 return SDValue(); 1864 1865 DiagnosticInfoUnsupported BadIntrin( 1866 *MF.getFunction(), "intrinsic not supported on subtarget", 1867 DL.getDebugLoc()); 1868 DAG.getContext()->diagnose(BadIntrin); 1869 return DAG.getUNDEF(VT); 1870 } 1871 case Intrinsic::amdgcn_ldexp: 1872 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, 1873 Op.getOperand(1), Op.getOperand(2)); 1874 1875 case Intrinsic::amdgcn_fract: 1876 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 1877 1878 case Intrinsic::amdgcn_class: 1879 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, 1880 Op.getOperand(1), Op.getOperand(2)); 1881 case Intrinsic::amdgcn_div_fmas: 1882 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, 1883 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), 1884 Op.getOperand(4)); 1885 1886 case Intrinsic::amdgcn_div_fixup: 1887 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, 1888 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 1889 1890 case Intrinsic::amdgcn_trig_preop: 1891 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, 1892 Op.getOperand(1), Op.getOperand(2)); 1893 case Intrinsic::amdgcn_div_scale: { 1894 // 3rd parameter required to be a constant. 1895 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 1896 if (!Param) 1897 return DAG.getUNDEF(VT); 1898 1899 // Translate to the operands expected by the machine instruction. The 1900 // first parameter must be the same as the first instruction. 1901 SDValue Numerator = Op.getOperand(1); 1902 SDValue Denominator = Op.getOperand(2); 1903 1904 // Note this order is opposite of the machine instruction's operations, 1905 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The 1906 // intrinsic has the numerator as the first operand to match a normal 1907 // division operation. 1908 1909 SDValue Src0 = Param->isAllOnesValue() ? 
Numerator : Denominator; 1910 1911 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, 1912 Denominator, Numerator); 1913 } 1914 default: 1915 return AMDGPUTargetLowering::LowerOperation(Op, DAG); 1916 } 1917} 1918 1919SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, 1920 SelectionDAG &DAG) const { 1921 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 1922 switch (IntrID) { 1923 case Intrinsic::amdgcn_atomic_inc: 1924 case Intrinsic::amdgcn_atomic_dec: { 1925 MemSDNode *M = cast<MemSDNode>(Op); 1926 unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? 1927 AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC; 1928 SDValue Ops[] = { 1929 M->getOperand(0), // Chain 1930 M->getOperand(2), // Ptr 1931 M->getOperand(3) // Value 1932 }; 1933 1934 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, 1935 M->getMemoryVT(), M->getMemOperand()); 1936 } 1937 default: 1938 return SDValue(); 1939 } 1940} 1941 1942SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 1943 SelectionDAG &DAG) const { 1944 MachineFunction &MF = DAG.getMachineFunction(); 1945 SDLoc DL(Op); 1946 SDValue Chain = Op.getOperand(0); 1947 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 1948 1949 switch (IntrinsicID) { 1950 case AMDGPUIntrinsic::SI_sendmsg: { 1951 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 1952 SDValue Glue = Chain.getValue(1); 1953 return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, 1954 Op.getOperand(2), Glue); 1955 } 1956 case AMDGPUIntrinsic::SI_tbuffer_store: { 1957 SDValue Ops[] = { 1958 Chain, 1959 Op.getOperand(2), 1960 Op.getOperand(3), 1961 Op.getOperand(4), 1962 Op.getOperand(5), 1963 Op.getOperand(6), 1964 Op.getOperand(7), 1965 Op.getOperand(8), 1966 Op.getOperand(9), 1967 Op.getOperand(10), 1968 Op.getOperand(11), 1969 Op.getOperand(12), 1970 Op.getOperand(13), 1971 Op.getOperand(14) 1972 }; 1973 1974 EVT VT = Op.getOperand(3).getValueType(); 1975 1976 MachineMemOperand *MMO = MF.getMachineMemOperand( 1977 MachinePointerInfo(), 1978 MachineMemOperand::MOStore, 1979 VT.getStoreSize(), 4); 1980 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, 1981 Op->getVTList(), Ops, VT, MMO); 1982 } 1983 case AMDGPUIntrinsic::AMDGPU_kill: { 1984 if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) { 1985 if (!K->isNegative()) 1986 return Chain; 1987 } 1988 1989 return Op; 1990 } 1991 default: 1992 return SDValue(); 1993 } 1994} 1995 1996SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 1997 SDLoc DL(Op); 1998 LoadSDNode *Load = cast<LoadSDNode>(Op); 1999 ISD::LoadExtType ExtType = Load->getExtensionType(); 2000 EVT MemVT = Load->getMemoryVT(); 2001 2002 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { 2003 assert(MemVT == MVT::i1 && "Only i1 non-extloads expected"); 2004 // FIXME: Copied from PPC 2005 // First, load into 32 bits, then truncate to 1 bit. 
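    // Roughly, the DAG produced below for an i1 load is:
    //   (i1 (load %p))  ->  (merge_values (trunc (i32 (extload %p, i8))), chain)
    // i.e. the memory access itself is widened to a byte and only the final
    // value is truncated back down to i1.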

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, MVT::i8, MMO);

    SDValue Ops[] = {
      DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  unsigned AS = Load->getAddressSpace();
  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                          AS, Load->getAlignment())) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues(Ops, DL);
  }

  unsigned NumElements = MemVT.getVectorNumElements();
  switch (AS) {
  case AMDGPUAS::CONSTANT_ADDRESS:
    if (isMemOpUniform(Load))
      return SDValue();
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
    // Fall-through
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::FLAT_ADDRESS:
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v4 loads are supported for private and global memory.
    return SDValue();
  case AMDGPUAS::PRIVATE_ADDRESS: {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorLoad(Load, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  }
  case AMDGPUAS::LOCAL_ADDRESS: {
    if (NumElements > 2)
      return SplitVectorLoad(Op, DAG);

    if (NumElements == 2)
      return SDValue();

    // If properly aligned, splitting might let us use ds_read_b64.
2080 return SplitVectorLoad(Op, DAG); 2081 } 2082 default: 2083 return SDValue(); 2084 } 2085} 2086 2087SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 2088 if (Op.getValueType() != MVT::i64) 2089 return SDValue(); 2090 2091 SDLoc DL(Op); 2092 SDValue Cond = Op.getOperand(0); 2093 2094 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 2095 SDValue One = DAG.getConstant(1, DL, MVT::i32); 2096 2097 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 2098 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 2099 2100 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 2101 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 2102 2103 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 2104 2105 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 2106 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 2107 2108 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 2109 2110 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); 2111 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); 2112} 2113 2114// Catch division cases where we can use shortcuts with rcp and rsq 2115// instructions. 2116SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { 2117 SDLoc SL(Op); 2118 SDValue LHS = Op.getOperand(0); 2119 SDValue RHS = Op.getOperand(1); 2120 EVT VT = Op.getValueType(); 2121 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; 2122 2123 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 2124 if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && 2125 CLHS->isExactlyValue(1.0)) { 2126 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 2127 // the CI documentation has a worst case error of 1 ulp. 2128 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 2129 // use it as long as we aren't trying to use denormals. 2130 2131 // 1.0 / sqrt(x) -> rsq(x) 2132 // 2133 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 2134 // error seems really high at 2^29 ULP. 2135 if (RHS.getOpcode() == ISD::FSQRT) 2136 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); 2137 2138 // 1.0 / x -> rcp(x) 2139 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 2140 } 2141 } 2142 2143 const SDNodeFlags *Flags = Op->getFlags(); 2144 2145 if (Unsafe || Flags->hasAllowReciprocal()) { 2146 // Turn into multiply by the reciprocal. 2147 // x / y -> x * (1.0 / y) 2148 SDNodeFlags Flags; 2149 Flags.setUnsafeAlgebra(true); 2150 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 2151 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); 2152 } 2153 2154 return SDValue(); 2155} 2156 2157SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 2158 if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) 2159 return FastLowered; 2160 2161 SDLoc SL(Op); 2162 SDValue LHS = Op.getOperand(0); 2163 SDValue RHS = Op.getOperand(1); 2164 2165 // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag 2166 if (EnableAMDGPUFastFDIV) { 2167 // This does not support denormals. 
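    // What the sequence below computes is roughly:
    //   s = |den| > 0x1.0p+96 ? 0x1.0p-32 : 1.0
    //   lhs / den ~= (lhs * rcp(den * s)) * s
    // Pre-scaling a huge denominator by 2^-32 keeps the v_rcp_f32 input out of
    // the range where the result would flush to zero, and the final multiply
    // by s undoes the scaling. K0 and K1 below are the f32 bit patterns of
    // 2^96 (0x6f800000) and 2^-32 (0x2f800000).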
2168 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); 2169 2170 const APFloat K0Val(BitsToFloat(0x6f800000)); 2171 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); 2172 2173 const APFloat K1Val(BitsToFloat(0x2f800000)); 2174 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); 2175 2176 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 2177 2178 EVT SetCCVT = 2179 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); 2180 2181 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 2182 2183 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); 2184 2185 // TODO: Should this propagate fast-math-flags? 2186 2187 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); 2188 2189 // rcp does not support denormals. 2190 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); 2191 2192 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); 2193 2194 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); 2195 } 2196 2197 // Generates more precise fpdiv32. 2198 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 2199 2200 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); 2201 2202 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); 2203 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); 2204 2205 // Denominator is scaled to not be denormal, so using rcp is ok. 2206 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled); 2207 2208 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled); 2209 2210 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One); 2211 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp); 2212 2213 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1); 2214 2215 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled); 2216 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul); 2217 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled); 2218 2219 SDValue Scale = NumeratorScaled.getValue(1); 2220 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale); 2221 2222 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); 2223} 2224 2225SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { 2226 if (DAG.getTarget().Options.UnsafeFPMath) 2227 return LowerFastFDIV(Op, DAG); 2228 2229 SDLoc SL(Op); 2230 SDValue X = Op.getOperand(0); 2231 SDValue Y = Op.getOperand(1); 2232 2233 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 2234 2235 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); 2236 2237 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); 2238 2239 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); 2240 2241 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); 2242 2243 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); 2244 2245 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); 2246 2247 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); 2248 2249 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); 2250 2251 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); 2252 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); 2253 2254 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, 
MVT::f64, 2255 NegDivScale0, Mul, DivScale1); 2256 2257 SDValue Scale; 2258 2259 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { 2260 // Workaround a hardware bug on SI where the condition output from div_scale 2261 // is not usable. 2262 2263 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); 2264 2265 // Figure out if the scale to use for div_fmas. 2266 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 2267 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); 2268 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); 2269 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); 2270 2271 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); 2272 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); 2273 2274 SDValue Scale0Hi 2275 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); 2276 SDValue Scale1Hi 2277 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); 2278 2279 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); 2280 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); 2281 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); 2282 } else { 2283 Scale = DivScale1.getValue(1); 2284 } 2285 2286 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, 2287 Fma4, Fma3, Mul, Scale); 2288 2289 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); 2290} 2291 2292SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { 2293 EVT VT = Op.getValueType(); 2294 2295 if (VT == MVT::f32) 2296 return LowerFDIV32(Op, DAG); 2297 2298 if (VT == MVT::f64) 2299 return LowerFDIV64(Op, DAG); 2300 2301 llvm_unreachable("Unexpected type for fdiv"); 2302} 2303 2304SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2305 SDLoc DL(Op); 2306 StoreSDNode *Store = cast<StoreSDNode>(Op); 2307 EVT VT = Store->getMemoryVT(); 2308 2309 if (VT == MVT::i1) { 2310 return DAG.getTruncStore(Store->getChain(), DL, 2311 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 2312 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 2313 } 2314 2315 assert(VT.isVector() && 2316 Store->getValue().getValueType().getScalarType() == MVT::i32); 2317 2318 unsigned AS = Store->getAddressSpace(); 2319 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, 2320 AS, Store->getAlignment())) { 2321 return expandUnalignedStore(Store, DAG); 2322 } 2323 2324 unsigned NumElements = VT.getVectorNumElements(); 2325 switch (AS) { 2326 case AMDGPUAS::GLOBAL_ADDRESS: 2327 case AMDGPUAS::FLAT_ADDRESS: 2328 if (NumElements > 4) 2329 return SplitVectorStore(Op, DAG); 2330 return SDValue(); 2331 case AMDGPUAS::PRIVATE_ADDRESS: { 2332 switch (Subtarget->getMaxPrivateElementSize()) { 2333 case 4: 2334 return scalarizeVectorStore(Store, DAG); 2335 case 8: 2336 if (NumElements > 2) 2337 return SplitVectorStore(Op, DAG); 2338 return SDValue(); 2339 case 16: 2340 if (NumElements > 4) 2341 return SplitVectorStore(Op, DAG); 2342 return SDValue(); 2343 default: 2344 llvm_unreachable("unsupported private_element_size"); 2345 } 2346 } 2347 case AMDGPUAS::LOCAL_ADDRESS: { 2348 if (NumElements > 2) 2349 return SplitVectorStore(Op, DAG); 2350 2351 if (NumElements == 2) 2352 return Op; 2353 2354 // If properly aligned, if we split we might be able to use ds_write_b64. 
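    // For example, a sufficiently aligned v4i32 local store split here becomes
    // two v2i32 stores, each of which can be selected to a single ds_write_b64.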
2355 return SplitVectorStore(Op, DAG); 2356 } 2357 default: 2358 llvm_unreachable("unhandled address space"); 2359 } 2360} 2361 2362SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 2363 SDLoc DL(Op); 2364 EVT VT = Op.getValueType(); 2365 SDValue Arg = Op.getOperand(0); 2366 // TODO: Should this propagate fast-math-flags? 2367 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 2368 DAG.getNode(ISD::FMUL, DL, VT, Arg, 2369 DAG.getConstantFP(0.5/M_PI, DL, 2370 VT))); 2371 2372 switch (Op.getOpcode()) { 2373 case ISD::FCOS: 2374 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); 2375 case ISD::FSIN: 2376 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); 2377 default: 2378 llvm_unreachable("Wrong trig opcode"); 2379 } 2380} 2381 2382SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 2383 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op); 2384 assert(AtomicNode->isCompareAndSwap()); 2385 unsigned AS = AtomicNode->getAddressSpace(); 2386 2387 // No custom lowering required for local address space 2388 if (!isFlatGlobalAddrSpace(AS)) 2389 return Op; 2390 2391 // Non-local address space requires custom lowering for atomic compare 2392 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 2393 SDLoc DL(Op); 2394 SDValue ChainIn = Op.getOperand(0); 2395 SDValue Addr = Op.getOperand(1); 2396 SDValue Old = Op.getOperand(2); 2397 SDValue New = Op.getOperand(3); 2398 EVT VT = Op.getValueType(); 2399 MVT SimpleVT = VT.getSimpleVT(); 2400 MVT VecType = MVT::getVectorVT(SimpleVT, 2); 2401 2402 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); 2403 SDValue Ops[] = { ChainIn, Addr, NewOld }; 2404 2405 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), 2406 Ops, VT, AtomicNode->getMemOperand()); 2407} 2408 2409//===----------------------------------------------------------------------===// 2410// Custom DAG optimizations 2411//===----------------------------------------------------------------------===// 2412 2413SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, 2414 DAGCombinerInfo &DCI) const { 2415 EVT VT = N->getValueType(0); 2416 EVT ScalarVT = VT.getScalarType(); 2417 if (ScalarVT != MVT::f32) 2418 return SDValue(); 2419 2420 SelectionDAG &DAG = DCI.DAG; 2421 SDLoc DL(N); 2422 2423 SDValue Src = N->getOperand(0); 2424 EVT SrcVT = Src.getValueType(); 2425 2426 // TODO: We could try to match extracting the higher bytes, which would be 2427 // easier if i8 vectors weren't promoted to i32 vectors, particularly after 2428 // types are legalized. v4i8 -> v4f32 is probably the only case to worry 2429 // about in practice. 2430 if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { 2431 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 2432 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); 2433 DCI.AddToWorklist(Cvt.getNode()); 2434 return Cvt; 2435 } 2436 } 2437 2438 return SDValue(); 2439} 2440 2441/// \brief Return true if the given offset Size in bytes can be folded into 2442/// the immediate offsets of a memory instruction for the given address space. 2443static bool canFoldOffset(unsigned OffsetSize, unsigned AS, 2444 const SISubtarget &STI) { 2445 switch (AS) { 2446 case AMDGPUAS::GLOBAL_ADDRESS: { 2447 // MUBUF instructions a 12-bit offset in bytes. 
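    // e.g. a combined offset of 4095 still folds here, while 4096 does not.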
2448 return isUInt<12>(OffsetSize); 2449 } 2450 case AMDGPUAS::CONSTANT_ADDRESS: { 2451 // SMRD instructions have an 8-bit offset in dwords on SI and 2452 // a 20-bit offset in bytes on VI. 2453 if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 2454 return isUInt<20>(OffsetSize); 2455 else 2456 return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); 2457 } 2458 case AMDGPUAS::LOCAL_ADDRESS: 2459 case AMDGPUAS::REGION_ADDRESS: { 2460 // The single offset versions have a 16-bit offset in bytes. 2461 return isUInt<16>(OffsetSize); 2462 } 2463 case AMDGPUAS::PRIVATE_ADDRESS: 2464 // Indirect register addressing does not use any offsets. 2465 default: 2466 return 0; 2467 } 2468} 2469 2470// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 2471 2472// This is a variant of 2473// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 2474// 2475// The normal DAG combiner will do this, but only if the add has one use since 2476// that would increase the number of instructions. 2477// 2478// This prevents us from seeing a constant offset that can be folded into a 2479// memory instruction's addressing mode. If we know the resulting add offset of 2480// a pointer can be folded into an addressing offset, we can replace the pointer 2481// operand with the add of new constant offset. This eliminates one of the uses, 2482// and may allow the remaining use to also be simplified. 2483// 2484SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, 2485 unsigned AddrSpace, 2486 DAGCombinerInfo &DCI) const { 2487 SDValue N0 = N->getOperand(0); 2488 SDValue N1 = N->getOperand(1); 2489 2490 if (N0.getOpcode() != ISD::ADD) 2491 return SDValue(); 2492 2493 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 2494 if (!CN1) 2495 return SDValue(); 2496 2497 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 2498 if (!CAdd) 2499 return SDValue(); 2500 2501 // If the resulting offset is too large, we can't fold it into the addressing 2502 // mode offset. 
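  // A concrete example (constants chosen purely for illustration): for a DS
  // access of the form (shl (add x, 16), 2), the rewritten form is
  // (add (shl x, 2), 64), and 64 comfortably fits in the 16-bit DS offset
  // range that canFoldOffset accepts.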
2503 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 2504 if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget())) 2505 return SDValue(); 2506 2507 SelectionDAG &DAG = DCI.DAG; 2508 SDLoc SL(N); 2509 EVT VT = N->getValueType(0); 2510 2511 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 2512 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); 2513 2514 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); 2515} 2516 2517SDValue SITargetLowering::performAndCombine(SDNode *N, 2518 DAGCombinerInfo &DCI) const { 2519 if (DCI.isBeforeLegalize()) 2520 return SDValue(); 2521 2522 if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI)) 2523 return Base; 2524 2525 SelectionDAG &DAG = DCI.DAG; 2526 2527 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 2528 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 2529 SDValue LHS = N->getOperand(0); 2530 SDValue RHS = N->getOperand(1); 2531 2532 if (LHS.getOpcode() == ISD::SETCC && 2533 RHS.getOpcode() == ISD::SETCC) { 2534 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 2535 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 2536 2537 SDValue X = LHS.getOperand(0); 2538 SDValue Y = RHS.getOperand(0); 2539 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) 2540 return SDValue(); 2541 2542 if (LCC == ISD::SETO) { 2543 if (X != LHS.getOperand(1)) 2544 return SDValue(); 2545 2546 if (RCC == ISD::SETUNE) { 2547 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 2548 if (!C1 || !C1->isInfinity() || C1->isNegative()) 2549 return SDValue(); 2550 2551 const uint32_t Mask = SIInstrFlags::N_NORMAL | 2552 SIInstrFlags::N_SUBNORMAL | 2553 SIInstrFlags::N_ZERO | 2554 SIInstrFlags::P_ZERO | 2555 SIInstrFlags::P_SUBNORMAL | 2556 SIInstrFlags::P_NORMAL; 2557 2558 static_assert(((~(SIInstrFlags::S_NAN | 2559 SIInstrFlags::Q_NAN | 2560 SIInstrFlags::N_INFINITY | 2561 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, 2562 "mask not equal"); 2563 2564 SDLoc DL(N); 2565 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 2566 X, DAG.getConstant(Mask, DL, MVT::i32)); 2567 } 2568 } 2569 } 2570 2571 return SDValue(); 2572} 2573 2574SDValue SITargetLowering::performOrCombine(SDNode *N, 2575 DAGCombinerInfo &DCI) const { 2576 SelectionDAG &DAG = DCI.DAG; 2577 SDValue LHS = N->getOperand(0); 2578 SDValue RHS = N->getOperand(1); 2579 2580 EVT VT = N->getValueType(0); 2581 if (VT == MVT::i64) { 2582 // TODO: This could be a generic combine with a predicate for extracting the 2583 // high half of an integer being free. 
2584 2585 // (or i64:x, (zero_extend i32:y)) -> 2586 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) 2587 if (LHS.getOpcode() == ISD::ZERO_EXTEND && 2588 RHS.getOpcode() != ISD::ZERO_EXTEND) 2589 std::swap(LHS, RHS); 2590 2591 if (RHS.getOpcode() == ISD::ZERO_EXTEND) { 2592 SDValue ExtSrc = RHS.getOperand(0); 2593 EVT SrcVT = ExtSrc.getValueType(); 2594 if (SrcVT == MVT::i32) { 2595 SDLoc SL(N); 2596 SDValue LowLHS, HiBits; 2597 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); 2598 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); 2599 2600 DCI.AddToWorklist(LowOr.getNode()); 2601 DCI.AddToWorklist(HiBits.getNode()); 2602 2603 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 2604 LowOr, HiBits); 2605 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 2606 } 2607 } 2608 } 2609 2610 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 2611 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 2612 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 2613 SDValue Src = LHS.getOperand(0); 2614 if (Src != RHS.getOperand(0)) 2615 return SDValue(); 2616 2617 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 2618 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 2619 if (!CLHS || !CRHS) 2620 return SDValue(); 2621 2622 // Only 10 bits are used. 2623 static const uint32_t MaxMask = 0x3ff; 2624 2625 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 2626 SDLoc DL(N); 2627 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 2628 Src, DAG.getConstant(NewMask, DL, MVT::i32)); 2629 } 2630 2631 return SDValue(); 2632} 2633 2634SDValue SITargetLowering::performClassCombine(SDNode *N, 2635 DAGCombinerInfo &DCI) const { 2636 SelectionDAG &DAG = DCI.DAG; 2637 SDValue Mask = N->getOperand(1); 2638 2639 // fp_class x, 0 -> false 2640 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { 2641 if (CMask->isNullValue()) 2642 return DAG.getConstant(0, SDLoc(N), MVT::i1); 2643 } 2644 2645 if (N->getOperand(0).isUndef()) 2646 return DAG.getUNDEF(MVT::i1); 2647 2648 return SDValue(); 2649} 2650 2651// Constant fold canonicalize. 2652SDValue SITargetLowering::performFCanonicalizeCombine( 2653 SDNode *N, 2654 DAGCombinerInfo &DCI) const { 2655 ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 2656 if (!CFP) 2657 return SDValue(); 2658 2659 SelectionDAG &DAG = DCI.DAG; 2660 const APFloat &C = CFP->getValueAPF(); 2661 2662 // Flush denormals to 0 if not enabled. 2663 if (C.isDenormal()) { 2664 EVT VT = N->getValueType(0); 2665 if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) 2666 return DAG.getConstantFP(0.0, SDLoc(N), VT); 2667 2668 if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) 2669 return DAG.getConstantFP(0.0, SDLoc(N), VT); 2670 } 2671 2672 if (C.isNaN()) { 2673 EVT VT = N->getValueType(0); 2674 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); 2675 if (C.isSignaling()) { 2676 // Quiet a signaling NaN. 2677 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); 2678 } 2679 2680 // Make sure it is the canonical NaN bitpattern. 2681 // 2682 // TODO: Can we use -1 as the canonical NaN value since it's an inline 2683 // immediate? 
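    // For f32 the canonical quiet NaN returned by APFloat::getQNaN is
    // 0x7fc00000, so e.g. a quiet NaN constant carrying an extra payload bit
    // (0x7fc00001) is rewritten to that bit pattern by the check below.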
2684 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) 2685 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); 2686 } 2687 2688 return SDValue(CFP, 0); 2689} 2690 2691static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 2692 switch (Opc) { 2693 case ISD::FMAXNUM: 2694 return AMDGPUISD::FMAX3; 2695 case ISD::SMAX: 2696 return AMDGPUISD::SMAX3; 2697 case ISD::UMAX: 2698 return AMDGPUISD::UMAX3; 2699 case ISD::FMINNUM: 2700 return AMDGPUISD::FMIN3; 2701 case ISD::SMIN: 2702 return AMDGPUISD::SMIN3; 2703 case ISD::UMIN: 2704 return AMDGPUISD::UMIN3; 2705 default: 2706 llvm_unreachable("Not a min/max opcode"); 2707 } 2708} 2709 2710static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, 2711 SDValue Op0, SDValue Op1, bool Signed) { 2712 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); 2713 if (!K1) 2714 return SDValue(); 2715 2716 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); 2717 if (!K0) 2718 return SDValue(); 2719 2720 if (Signed) { 2721 if (K0->getAPIntValue().sge(K1->getAPIntValue())) 2722 return SDValue(); 2723 } else { 2724 if (K0->getAPIntValue().uge(K1->getAPIntValue())) 2725 return SDValue(); 2726 } 2727 2728 EVT VT = K0->getValueType(0); 2729 return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, 2730 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); 2731} 2732 2733static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { 2734 if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) 2735 return true; 2736 2737 return DAG.isKnownNeverNaN(Op); 2738} 2739 2740static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, 2741 SDValue Op0, SDValue Op1) { 2742 ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); 2743 if (!K1) 2744 return SDValue(); 2745 2746 ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1)); 2747 if (!K0) 2748 return SDValue(); 2749 2750 // Ordered >= (although NaN inputs should have folded away by now). 2751 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); 2752 if (Cmp == APFloat::cmpGreaterThan) 2753 return SDValue(); 2754 2755 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a 2756 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then 2757 // give the other result, which is different from med3 with a NaN input. 2758 SDValue Var = Op0.getOperand(0); 2759 if (!isKnownNeverSNan(DAG, Var)) 2760 return SDValue(); 2761 2762 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), 2763 Var, SDValue(K0, 0), SDValue(K1, 0)); 2764} 2765 2766SDValue SITargetLowering::performMinMaxCombine(SDNode *N, 2767 DAGCombinerInfo &DCI) const { 2768 SelectionDAG &DAG = DCI.DAG; 2769 2770 unsigned Opc = N->getOpcode(); 2771 SDValue Op0 = N->getOperand(0); 2772 SDValue Op1 = N->getOperand(1); 2773 2774 // Only do this if the inner op has one use since this will just increases 2775 // register pressure for no benefit. 2776 2777 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) { 2778 // max(max(a, b), c) -> max3(a, b, c) 2779 // min(min(a, b), c) -> min3(a, b, c) 2780 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { 2781 SDLoc DL(N); 2782 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 2783 DL, 2784 N->getValueType(0), 2785 Op0.getOperand(0), 2786 Op0.getOperand(1), 2787 Op1); 2788 } 2789 2790 // Try commuted. 
2791 // max(a, max(b, c)) -> max3(a, b, c) 2792 // min(a, min(b, c)) -> min3(a, b, c) 2793 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 2794 SDLoc DL(N); 2795 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 2796 DL, 2797 N->getValueType(0), 2798 Op0, 2799 Op1.getOperand(0), 2800 Op1.getOperand(1)); 2801 } 2802 } 2803 2804 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) 2805 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { 2806 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) 2807 return Med3; 2808 } 2809 2810 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { 2811 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) 2812 return Med3; 2813 } 2814 2815 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) 2816 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || 2817 (Opc == AMDGPUISD::FMIN_LEGACY && 2818 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && 2819 N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { 2820 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) 2821 return Res; 2822 } 2823 2824 return SDValue(); 2825} 2826 2827SDValue SITargetLowering::performSetCCCombine(SDNode *N, 2828 DAGCombinerInfo &DCI) const { 2829 SelectionDAG &DAG = DCI.DAG; 2830 SDLoc SL(N); 2831 2832 SDValue LHS = N->getOperand(0); 2833 SDValue RHS = N->getOperand(1); 2834 EVT VT = LHS.getValueType(); 2835 2836 if (VT != MVT::f32 && VT != MVT::f64) 2837 return SDValue(); 2838 2839 // Match isinf pattern 2840 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 2841 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 2842 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { 2843 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 2844 if (!CRHS) 2845 return SDValue(); 2846 2847 const APFloat &APF = CRHS->getValueAPF(); 2848 if (APF.isInfinity() && !APF.isNegative()) { 2849 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 2850 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 2851 DAG.getConstant(Mask, SL, MVT::i32)); 2852 } 2853 } 2854 2855 return SDValue(); 2856} 2857 2858SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 2859 DAGCombinerInfo &DCI) const { 2860 SelectionDAG &DAG = DCI.DAG; 2861 SDLoc DL(N); 2862 2863 switch (N->getOpcode()) { 2864 default: 2865 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 2866 case ISD::SETCC: 2867 return performSetCCCombine(N, DCI); 2868 case ISD::FMAXNUM: 2869 case ISD::FMINNUM: 2870 case ISD::SMAX: 2871 case ISD::SMIN: 2872 case ISD::UMAX: 2873 case ISD::UMIN: 2874 case AMDGPUISD::FMIN_LEGACY: 2875 case AMDGPUISD::FMAX_LEGACY: { 2876 if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && 2877 N->getValueType(0) != MVT::f64 && 2878 getTargetMachine().getOptLevel() > CodeGenOpt::None) 2879 return performMinMaxCombine(N, DCI); 2880 break; 2881 } 2882 2883 case AMDGPUISD::CVT_F32_UBYTE0: 2884 case AMDGPUISD::CVT_F32_UBYTE1: 2885 case AMDGPUISD::CVT_F32_UBYTE2: 2886 case AMDGPUISD::CVT_F32_UBYTE3: { 2887 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 2888 SDValue Src = N->getOperand(0); 2889 2890 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. 
2891 if (Src.getOpcode() == ISD::SRL) { 2892 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x 2893 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x 2894 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x 2895 2896 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) { 2897 unsigned SrcOffset = C->getZExtValue() + 8 * Offset; 2898 if (SrcOffset < 32 && SrcOffset % 8 == 0) { 2899 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL, 2900 MVT::f32, Src.getOperand(0)); 2901 } 2902 } 2903 } 2904 2905 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 2906 2907 APInt KnownZero, KnownOne; 2908 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 2909 !DCI.isBeforeLegalizeOps()); 2910 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 2911 if (TLO.ShrinkDemandedConstant(Src, Demanded) || 2912 TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { 2913 DCI.CommitTargetLoweringOpt(TLO); 2914 } 2915 2916 break; 2917 } 2918 2919 case ISD::UINT_TO_FP: { 2920 return performUCharToFloatCombine(N, DCI); 2921 } 2922 case ISD::FADD: { 2923 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 2924 break; 2925 2926 EVT VT = N->getValueType(0); 2927 if (VT != MVT::f32) 2928 break; 2929 2930 // Only do this if we are not trying to support denormals. v_mad_f32 does 2931 // not support denormals ever. 2932 if (Subtarget->hasFP32Denormals()) 2933 break; 2934 2935 SDValue LHS = N->getOperand(0); 2936 SDValue RHS = N->getOperand(1); 2937 2938 // These should really be instruction patterns, but writing patterns with 2939 // source modiifiers is a pain. 2940 2941 // fadd (fadd (a, a), b) -> mad 2.0, a, b 2942 if (LHS.getOpcode() == ISD::FADD) { 2943 SDValue A = LHS.getOperand(0); 2944 if (A == LHS.getOperand(1)) { 2945 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 2946 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); 2947 } 2948 } 2949 2950 // fadd (b, fadd (a, a)) -> mad 2.0, a, b 2951 if (RHS.getOpcode() == ISD::FADD) { 2952 SDValue A = RHS.getOperand(0); 2953 if (A == RHS.getOperand(1)) { 2954 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 2955 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); 2956 } 2957 } 2958 2959 return SDValue(); 2960 } 2961 case ISD::FSUB: { 2962 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 2963 break; 2964 2965 EVT VT = N->getValueType(0); 2966 2967 // Try to get the fneg to fold into the source modifier. This undoes generic 2968 // DAG combines and folds them into the mad. 2969 // 2970 // Only do this if we are not trying to support denormals. v_mad_f32 does 2971 // not support denormals ever. 
2972 if (VT == MVT::f32 && 2973 !Subtarget->hasFP32Denormals()) { 2974 SDValue LHS = N->getOperand(0); 2975 SDValue RHS = N->getOperand(1); 2976 if (LHS.getOpcode() == ISD::FADD) { 2977 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 2978 2979 SDValue A = LHS.getOperand(0); 2980 if (A == LHS.getOperand(1)) { 2981 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 2982 SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); 2983 2984 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); 2985 } 2986 } 2987 2988 if (RHS.getOpcode() == ISD::FADD) { 2989 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 2990 2991 SDValue A = RHS.getOperand(0); 2992 if (A == RHS.getOperand(1)) { 2993 const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); 2994 return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); 2995 } 2996 } 2997 2998 return SDValue(); 2999 } 3000 3001 break; 3002 } 3003 case ISD::LOAD: 3004 case ISD::STORE: 3005 case ISD::ATOMIC_LOAD: 3006 case ISD::ATOMIC_STORE: 3007 case ISD::ATOMIC_CMP_SWAP: 3008 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 3009 case ISD::ATOMIC_SWAP: 3010 case ISD::ATOMIC_LOAD_ADD: 3011 case ISD::ATOMIC_LOAD_SUB: 3012 case ISD::ATOMIC_LOAD_AND: 3013 case ISD::ATOMIC_LOAD_OR: 3014 case ISD::ATOMIC_LOAD_XOR: 3015 case ISD::ATOMIC_LOAD_NAND: 3016 case ISD::ATOMIC_LOAD_MIN: 3017 case ISD::ATOMIC_LOAD_MAX: 3018 case ISD::ATOMIC_LOAD_UMIN: 3019 case ISD::ATOMIC_LOAD_UMAX: 3020 case AMDGPUISD::ATOMIC_INC: 3021 case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. 3022 if (DCI.isBeforeLegalize()) 3023 break; 3024 3025 MemSDNode *MemNode = cast<MemSDNode>(N); 3026 SDValue Ptr = MemNode->getBasePtr(); 3027 3028 // TODO: We could also do this for multiplies. 3029 unsigned AS = MemNode->getAddressSpace(); 3030 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { 3031 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); 3032 if (NewPtr) { 3033 SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); 3034 3035 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; 3036 return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); 3037 } 3038 } 3039 break; 3040 } 3041 case ISD::AND: 3042 return performAndCombine(N, DCI); 3043 case ISD::OR: 3044 return performOrCombine(N, DCI); 3045 case AMDGPUISD::FP_CLASS: 3046 return performClassCombine(N, DCI); 3047 case ISD::FCANONICALIZE: 3048 return performFCanonicalizeCombine(N, DCI); 3049 case AMDGPUISD::FRACT: 3050 case AMDGPUISD::RCP: 3051 case AMDGPUISD::RSQ: 3052 case AMDGPUISD::RSQ_LEGACY: 3053 case AMDGPUISD::RSQ_CLAMP: 3054 case AMDGPUISD::LDEXP: { 3055 SDValue Src = N->getOperand(0); 3056 if (Src.isUndef()) 3057 return Src; 3058 break; 3059 } 3060 } 3061 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 3062} 3063 3064/// \brief Analyze the possible immediate value Op 3065/// 3066/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate 3067/// and the immediate value if it's a literal immediate 3068int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { 3069 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 3070 3071 if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { 3072 if (TII->isInlineConstant(Node->getAPIntValue())) 3073 return 0; 3074 3075 uint64_t Val = Node->getZExtValue(); 3076 return isUInt<32>(Val) ? 
Val : -1; 3077 } 3078 3079 if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { 3080 if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) 3081 return 0; 3082 3083 if (Node->getValueType(0) == MVT::f32) 3084 return FloatToBits(Node->getValueAPF().convertToFloat()); 3085 3086 return -1; 3087 } 3088 3089 return -1; 3090} 3091 3092/// \brief Helper function for adjustWritemask 3093static unsigned SubIdx2Lane(unsigned Idx) { 3094 switch (Idx) { 3095 default: return 0; 3096 case AMDGPU::sub0: return 0; 3097 case AMDGPU::sub1: return 1; 3098 case AMDGPU::sub2: return 2; 3099 case AMDGPU::sub3: return 3; 3100 } 3101} 3102 3103/// \brief Adjust the writemask of MIMG instructions 3104void SITargetLowering::adjustWritemask(MachineSDNode *&Node, 3105 SelectionDAG &DAG) const { 3106 SDNode *Users[4] = { }; 3107 unsigned Lane = 0; 3108 unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; 3109 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); 3110 unsigned NewDmask = 0; 3111 3112 // Try to figure out the used register components 3113 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 3114 I != E; ++I) { 3115 3116 // Abort if we can't understand the usage 3117 if (!I->isMachineOpcode() || 3118 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 3119 return; 3120 3121 // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. 3122 // Note that subregs are packed, i.e. Lane==0 is the first bit set 3123 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 3124 // set, etc. 3125 Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 3126 3127 // Set which texture component corresponds to the lane. 3128 unsigned Comp; 3129 for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { 3130 assert(Dmask); 3131 Comp = countTrailingZeros(Dmask); 3132 Dmask &= ~(1 << Comp); 3133 } 3134 3135 // Abort if we have more than one user per component 3136 if (Users[Lane]) 3137 return; 3138 3139 Users[Lane] = *I; 3140 NewDmask |= 1 << Comp; 3141 } 3142 3143 // Abort if there's no change 3144 if (NewDmask == OldDmask) 3145 return; 3146 3147 // Adjust the writemask in the node 3148 std::vector<SDValue> Ops; 3149 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); 3150 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); 3151 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); 3152 Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); 3153 3154 // If we only got one lane, replace it with a copy 3155 // (if NewDmask has only one bit set...) 
3156 if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { 3157 SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), 3158 MVT::i32); 3159 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 3160 SDLoc(), Users[Lane]->getValueType(0), 3161 SDValue(Node, 0), RC); 3162 DAG.ReplaceAllUsesWith(Users[Lane], Copy); 3163 return; 3164 } 3165 3166 // Update the users of the node with the new indices 3167 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { 3168 3169 SDNode *User = Users[i]; 3170 if (!User) 3171 continue; 3172 3173 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); 3174 DAG.UpdateNodeOperands(User, User->getOperand(0), Op); 3175 3176 switch (Idx) { 3177 default: break; 3178 case AMDGPU::sub0: Idx = AMDGPU::sub1; break; 3179 case AMDGPU::sub1: Idx = AMDGPU::sub2; break; 3180 case AMDGPU::sub2: Idx = AMDGPU::sub3; break; 3181 } 3182 } 3183} 3184 3185static bool isFrameIndexOp(SDValue Op) { 3186 if (Op.getOpcode() == ISD::AssertZext) 3187 Op = Op.getOperand(0); 3188 3189 return isa<FrameIndexSDNode>(Op); 3190} 3191 3192/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) 3193/// with frame index operands. 3194/// LLVM assumes that inputs are to these instructions are registers. 3195void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, 3196 SelectionDAG &DAG) const { 3197 3198 SmallVector<SDValue, 8> Ops; 3199 for (unsigned i = 0; i < Node->getNumOperands(); ++i) { 3200 if (!isFrameIndexOp(Node->getOperand(i))) { 3201 Ops.push_back(Node->getOperand(i)); 3202 continue; 3203 } 3204 3205 SDLoc DL(Node); 3206 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, 3207 Node->getOperand(i).getValueType(), 3208 Node->getOperand(i)), 0)); 3209 } 3210 3211 DAG.UpdateNodeOperands(Node, Ops); 3212} 3213 3214/// \brief Fold the instructions after selecting them. 3215SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 3216 SelectionDAG &DAG) const { 3217 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 3218 unsigned Opcode = Node->getMachineOpcode(); 3219 3220 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && 3221 !TII->isGather4(Opcode)) 3222 adjustWritemask(Node, DAG); 3223 3224 if (Opcode == AMDGPU::INSERT_SUBREG || 3225 Opcode == AMDGPU::REG_SEQUENCE) { 3226 legalizeTargetIndependentNode(Node, DAG); 3227 return Node; 3228 } 3229 return Node; 3230} 3231 3232/// \brief Assign the register class depending on the number of 3233/// bits set in the writemask 3234void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 3235 SDNode *Node) const { 3236 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 3237 3238 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 3239 3240 if (TII->isVOP3(MI.getOpcode())) { 3241 // Make sure constant bus requirements are respected. 3242 TII->legalizeOperandsVOP3(MRI, MI); 3243 return; 3244 } 3245 3246 if (TII->isMIMG(MI)) { 3247 unsigned VReg = MI.getOperand(0).getReg(); 3248 unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; 3249 unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); 3250 unsigned BitsSet = 0; 3251 for (unsigned i = 0; i < 4; ++i) 3252 BitsSet += Writemask & (1 << i) ? 
    const TargetRegisterClass *RC;
    switch (BitsSet) {
    default: return;
    case 1: RC = &AMDGPU::VGPR_32RegClass; break;
    case 2: RC = &AMDGPU::VReg_64RegClass; break;
    case 3: RC = &AMDGPU::VReg_96RegClass; break;
    }

    unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
    MI.setDesc(TII->get(NewOpcode));
    MRI.setRegClass(VReg, RC);
    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);
      return;
    }

    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    // instruction, because the return type of these instructions is a vec2 of
    // the memory type, so it can be tied to the input operand.
    // This means these instructions always have a use, so we need to add a
    // special case to check if the atomic has only one extract_subreg use,
    // which itself has no uses.
    if ((Node->hasNUsesOfValue(1, 0) &&
         Node->use_begin()->isMachineOpcode() &&
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
         !Node->use_begin()->hasAnyUseOfValue(0))) {
      unsigned Def = MI.getOperand(0).getReg();

      // Change this into a noret atomic.
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);

      // If we only remove the def operand from the atomic instruction, the
      // extract_subreg will be left with a use of a vreg without a def.
      // So we need to insert an implicit_def to avoid machine verifier
      // errors.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::IMPLICIT_DEF), Def);
    }
    return;
  }
}

static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Val) {
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}

MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
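  // The REG_SEQUENCE below assembles the final 128-bit descriptor: the 64-bit
  // address lands in sub0_sub1, while the constant pair built above (zero in
  // dword 2, the high half of TII->getDefaultRsrcDataFormat() in dword 3)
  // lands in sub2_sub3.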
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}

/// \brief Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                       DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                    0);
  }

  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}

SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}

//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//

std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {

  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 's':
    case 'r':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
        return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
      case 64:
        return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
      case 128:
        return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
      case 256:
        return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
      }

    case 'v':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
        return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
      case 64:
        return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
      case 96:
        return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
      case 128:
        return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
      case 256:
        return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
      case 512:
        return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
      }
    }
  }

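  // The multi-character path below appears intended to pin a specific
  // register, e.g. "{v0}" or "{s5}": the letter selects the 32-bit VGPR or
  // SGPR class and the trailing digits select the register index within it.
  // Anything this path cannot parse falls through to the generic
  // TargetLowering handling at the end.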
  if (Constraint.size() > 1) {
    const TargetRegisterClass *RC = nullptr;
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
      if (!Failed && Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
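
// Example usage of the constraints handled above (a hedged sketch, not taken
// from this file): LLVM IR inline assembly such as
//   %r = call i32 asm "v_mov_b32 $0, $1", "=v,s"(i32 %x)
// requests the result in a VGPR ('v') and the input in an SGPR ('s');
// getConstraintType classifies both letters as C_RegisterClass, and
// getRegForInlineAsmConstraint then maps them to the register classes above.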