1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for SI
12//
13//===----------------------------------------------------------------------===//
14
15#ifdef _MSC_VER
16// Provide M_PI.
17#define _USE_MATH_DEFINES
18#include <cmath>
19#endif
20
21#include "AMDGPU.h"
22#include "AMDGPUIntrinsicInfo.h"
23#include "AMDGPUSubtarget.h"
24#include "SIISelLowering.h"
25#include "SIInstrInfo.h"
26#include "SIMachineFunctionInfo.h"
27#include "SIRegisterInfo.h"
28#include "llvm/ADT/BitVector.h"
29#include "llvm/ADT/StringSwitch.h"
30#include "llvm/CodeGen/CallingConvLower.h"
31#include "llvm/CodeGen/MachineInstrBuilder.h"
32#include "llvm/CodeGen/MachineRegisterInfo.h"
33#include "llvm/CodeGen/SelectionDAG.h"
34#include "llvm/IR/DiagnosticInfo.h"
35#include "llvm/IR/Function.h"
36
37using namespace llvm;
38
39// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv.
40static cl::opt<bool> EnableAMDGPUFastFDIV(
41  "amdgpu-fast-fdiv",
42  cl::desc("Enable faster 2.5 ulp fdiv"),
43  cl::init(false));
44
45static unsigned findFirstFreeSGPR(CCState &CCInfo) {
46  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
47  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
48    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
49      return AMDGPU::SGPR0 + Reg;
50    }
51  }
52  llvm_unreachable("Cannot allocate sgpr");
53}
54
55SITargetLowering::SITargetLowering(const TargetMachine &TM,
56                                   const SISubtarget &STI)
57    : AMDGPUTargetLowering(TM, STI) {
58  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
59  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
60
61  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
62  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
63
64  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
65  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
66  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
67
68  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
69  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
70
71  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
72  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
73
74  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
75  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
76
77  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
78  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
79
80  computeRegisterProperties(STI.getRegisterInfo());
81
82  // We need to custom lower vector stores from local memory
83  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
84  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
85  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
86  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
87  setOperationAction(ISD::LOAD, MVT::i1, Custom);
88
89  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
90  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
91  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
92  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
93  setOperationAction(ISD::STORE, MVT::i1, Custom);
94
95  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
96  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
97  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
98  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
99
100  setOperationAction(ISD::SELECT, MVT::i1, Promote);
101  setOperationAction(ISD::SELECT, MVT::i64, Custom);
102  setOperationAction(ISD::SELECT, MVT::f64, Promote);
103  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
104
105  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
106  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
107  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
108  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
109  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
110
111  setOperationAction(ISD::SETCC, MVT::i1, Promote);
112  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
113  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
114
115  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
116  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
117
118  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
119  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
120  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
121  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
122  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
123  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
124  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
125
126  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
127  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
128  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
129
130  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
131  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
132  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
133  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
134  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
135  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
136
137  // We only support LOAD/STORE and vector manipulation ops for vectors
138  // with > 4 elements.
139  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
140    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
141      switch (Op) {
142      case ISD::LOAD:
143      case ISD::STORE:
144      case ISD::BUILD_VECTOR:
145      case ISD::BITCAST:
146      case ISD::EXTRACT_VECTOR_ELT:
147      case ISD::INSERT_VECTOR_ELT:
148      case ISD::INSERT_SUBVECTOR:
149      case ISD::EXTRACT_SUBVECTOR:
150      case ISD::SCALAR_TO_VECTOR:
151        break;
152      case ISD::CONCAT_VECTORS:
153        setOperationAction(Op, VT, Custom);
154        break;
155      default:
156        setOperationAction(Op, VT, Expand);
157        break;
158      }
159    }
160  }
161
162  // Most operations are naturally 32-bit vector operations. We only support
163  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
164  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
165    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
166    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
167
168    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
169    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
170
171    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
172    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
173
174    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
175    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
176  }
177
178  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
179  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
180  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
181  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
182
183  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
184  // and output demarshalling
185  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
186  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
187
188  // We can't return success/failure, only the old value,
189  // let LLVM add the comparison
190  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
191  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
192
193  if (getSubtarget()->hasFlatAddressSpace()) {
194    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
195    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
196  }
197
198  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
199  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
200
201  // On SI this is s_memtime and s_memrealtime on VI.
202  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
203  setOperationAction(ISD::TRAP, MVT::Other, Custom);
204
205  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
206  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
207
208  if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
209    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
210    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
211    setOperationAction(ISD::FRINT, MVT::f64, Legal);
212  }
213
214  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
215
216  setOperationAction(ISD::FSIN, MVT::f32, Custom);
217  setOperationAction(ISD::FCOS, MVT::f32, Custom);
218  setOperationAction(ISD::FDIV, MVT::f32, Custom);
219  setOperationAction(ISD::FDIV, MVT::f64, Custom);
220
221  setTargetDAGCombine(ISD::FADD);
222  setTargetDAGCombine(ISD::FSUB);
223  setTargetDAGCombine(ISD::FMINNUM);
224  setTargetDAGCombine(ISD::FMAXNUM);
225  setTargetDAGCombine(ISD::SMIN);
226  setTargetDAGCombine(ISD::SMAX);
227  setTargetDAGCombine(ISD::UMIN);
228  setTargetDAGCombine(ISD::UMAX);
229  setTargetDAGCombine(ISD::SETCC);
230  setTargetDAGCombine(ISD::AND);
231  setTargetDAGCombine(ISD::OR);
232  setTargetDAGCombine(ISD::UINT_TO_FP);
233  setTargetDAGCombine(ISD::FCANONICALIZE);
234
235  // All memory operations. Some folding on the pointer operand is done to help
236  // matching the constant offsets in the addressing modes.
237  setTargetDAGCombine(ISD::LOAD);
238  setTargetDAGCombine(ISD::STORE);
239  setTargetDAGCombine(ISD::ATOMIC_LOAD);
240  setTargetDAGCombine(ISD::ATOMIC_STORE);
241  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
242  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
243  setTargetDAGCombine(ISD::ATOMIC_SWAP);
244  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
245  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
246  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
247  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
248  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
249  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
250  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
251  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
252  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
253  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
254
255  setSchedulingPreference(Sched::RegPressure);
256}
257
258const SISubtarget *SITargetLowering::getSubtarget() const {
259  return static_cast<const SISubtarget *>(Subtarget);
260}
261
262//===----------------------------------------------------------------------===//
263// TargetLowering queries
264//===----------------------------------------------------------------------===//
265
266bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
267                                          const CallInst &CI,
268                                          unsigned IntrID) const {
269  switch (IntrID) {
270  case Intrinsic::amdgcn_atomic_inc:
271  case Intrinsic::amdgcn_atomic_dec:
272    Info.opc = ISD::INTRINSIC_W_CHAIN;
273    Info.memVT = MVT::getVT(CI.getType());
274    Info.ptrVal = CI.getOperand(0);
275    Info.align = 0;
276    Info.vol = false;
277    Info.readMem = true;
278    Info.writeMem = true;
279    return true;
280  default:
281    return false;
282  }
283}
284
285bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
286                                          EVT) const {
287  // SI has some legal vector types, but no legal vector operations. Say no
288  // shuffles are legal in order to prefer scalarizing some vector operations.
289  return false;
290}
291
292bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
293  // Flat instructions do not have offsets, and only have the register
294  // address.
295  return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
296}
297
298bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
299  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
300  // additionally can do r + r + i with addr64. 32-bit has more addressing
301  // mode options. Depending on the resource constant, it can also do
302  // (i64 r0) + (i32 r1) * (i14 i).
303  //
304  // Private arrays end up using a scratch buffer most of the time, so also
305  // assume those use MUBUF instructions. Scratch loads / stores are currently
306  // implemented as mubuf instructions with offen bit set, so slightly
307  // different than the normal addr64.
308  if (!isUInt<12>(AM.BaseOffs))
309    return false;
310
311  // FIXME: Since we can split immediate into soffset and immediate offset,
312  // would it make sense to allow any immediate?
313
314  switch (AM.Scale) {
315  case 0: // r + i or just i, depending on HasBaseReg.
316    return true;
317  case 1:
318    return true; // We have r + r or r + i.
319  case 2:
320    if (AM.HasBaseReg) {
321      // Reject 2 * r + r.
322      return false;
323    }
324
325    // Allow 2 * r as r + r
326    // Or  2 * r + i is allowed as r + r + i.
327    return true;
328  default: // Don't allow n * r
329    return false;
330  }
331}
332
333bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
334                                             const AddrMode &AM, Type *Ty,
335                                             unsigned AS) const {
336  // No global is ever allowed as a base.
337  if (AM.BaseGV)
338    return false;
339
340  switch (AS) {
341  case AMDGPUAS::GLOBAL_ADDRESS: {
342    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
343      // Assume the we will use FLAT for all global memory accesses
344      // on VI.
345      // FIXME: This assumption is currently wrong.  On VI we still use
346      // MUBUF instructions for the r + i addressing mode.  As currently
347      // implemented, the MUBUF instructions only work on buffer < 4GB.
348      // It may be possible to support > 4GB buffers with MUBUF instructions,
349      // by setting the stride value in the resource descriptor which would
350      // increase the size limit to (stride * 4GB).  However, this is risky,
351      // because it has never been validated.
352      return isLegalFlatAddressingMode(AM);
353    }
354
355    return isLegalMUBUFAddressingMode(AM);
356  }
357  case AMDGPUAS::CONSTANT_ADDRESS: {
358    // If the offset isn't a multiple of 4, it probably isn't going to be
359    // correctly aligned.
360    if (AM.BaseOffs % 4 != 0)
361      return isLegalMUBUFAddressingMode(AM);
362
363    // There are no SMRD extloads, so if we have to do a small type access we
364    // will use a MUBUF load.
365    // FIXME?: We also need to do this if unaligned, but we don't know the
366    // alignment here.
367    if (DL.getTypeStoreSize(Ty) < 4)
368      return isLegalMUBUFAddressingMode(AM);
369
370    if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
371      // SMRD instructions have an 8-bit, dword offset on SI.
372      if (!isUInt<8>(AM.BaseOffs / 4))
373        return false;
374    } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
375      // On CI+, this can also be a 32-bit literal constant offset. If it fits
376      // in 8-bits, it can use a smaller encoding.
377      if (!isUInt<32>(AM.BaseOffs / 4))
378        return false;
379    } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
380      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
381      if (!isUInt<20>(AM.BaseOffs))
382        return false;
383    } else
384      llvm_unreachable("unhandled generation");
385
386    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
387      return true;
388
389    if (AM.Scale == 1 && AM.HasBaseReg)
390      return true;
391
392    return false;
393  }
394
395  case AMDGPUAS::PRIVATE_ADDRESS:
396    return isLegalMUBUFAddressingMode(AM);
397
398  case AMDGPUAS::LOCAL_ADDRESS:
399  case AMDGPUAS::REGION_ADDRESS: {
400    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
401    // field.
402    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
403    // an 8-bit dword offset but we don't know the alignment here.
404    if (!isUInt<16>(AM.BaseOffs))
405      return false;
406
407    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
408      return true;
409
410    if (AM.Scale == 1 && AM.HasBaseReg)
411      return true;
412
413    return false;
414  }
415  case AMDGPUAS::FLAT_ADDRESS:
416  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
417    // For an unknown address space, this usually means that this is for some
418    // reason being used for pure arithmetic, and not based on some addressing
419    // computation. We don't have instructions that compute pointers with any
420    // addressing modes, so treat them as having no offset like flat
421    // instructions.
422    return isLegalFlatAddressingMode(AM);
423
424  default:
425    llvm_unreachable("unhandled address space");
426  }
427}
428
429bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
430                                                      unsigned AddrSpace,
431                                                      unsigned Align,
432                                                      bool *IsFast) const {
433  if (IsFast)
434    *IsFast = false;
435
436  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
437  // which isn't a simple VT.
438  if (!VT.isSimple() || VT == MVT::Other)
439    return false;
440
441  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
442      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
443    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
444    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
445    // with adjacent offsets.
446    bool AlignedBy4 = (Align % 4 == 0);
447    if (IsFast)
448      *IsFast = AlignedBy4;
449
450    return AlignedBy4;
451  }
452
453  if (Subtarget->hasUnalignedBufferAccess()) {
454    // If we have an uniform constant load, it still requires using a slow
455    // buffer instruction if unaligned.
456    if (IsFast) {
457      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
458        (Align % 4 == 0) : true;
459    }
460
461    return true;
462  }
463
464  // Smaller than dword value must be aligned.
465  if (VT.bitsLT(MVT::i32))
466    return false;
467
468  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
469  // byte-address are ignored, thus forcing Dword alignment.
470  // This applies to private, global, and constant memory.
471  if (IsFast)
472    *IsFast = true;
473
474  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
475}
476
477EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
478                                          unsigned SrcAlign, bool IsMemset,
479                                          bool ZeroMemset,
480                                          bool MemcpyStrSrc,
481                                          MachineFunction &MF) const {
482  // FIXME: Should account for address space here.
483
484  // The default fallback uses the private pointer size as a guess for a type to
485  // use. Make sure we switch these to 64-bit accesses.
486
487  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
488    return MVT::v4i32;
489
490  if (Size >= 8 && DstAlign >= 4)
491    return MVT::v2i32;
492
493  // Use the default.
494  return MVT::Other;
495}
496
497static bool isFlatGlobalAddrSpace(unsigned AS) {
498  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
499    AS == AMDGPUAS::FLAT_ADDRESS ||
500    AS == AMDGPUAS::CONSTANT_ADDRESS;
501}
502
503bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
504                                           unsigned DestAS) const {
505  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
506}
507
508bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
509  const MemSDNode *MemNode = cast<MemSDNode>(N);
510  const Value *Ptr = MemNode->getMemOperand()->getValue();
511
512  // UndefValue means this is a load of a kernel input.  These are uniform.
513  // Sometimes LDS instructions have constant pointers.
514  // If Ptr is null, then that means this mem operand contains a
515  // PseudoSourceValue like GOT.
516  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
517      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
518    return true;
519
520  const Instruction *I = dyn_cast<Instruction>(Ptr);
521  return I && I->getMetadata("amdgpu.uniform");
522}
523
524TargetLoweringBase::LegalizeTypeAction
525SITargetLowering::getPreferredVectorAction(EVT VT) const {
526  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
527    return TypeSplitVector;
528
529  return TargetLoweringBase::getPreferredVectorAction(VT);
530}
531
532bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
533                                                         Type *Ty) const {
534  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
535  return TII->isInlineConstant(Imm);
536}
537
538bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
539
540  // SimplifySetCC uses this function to determine whether or not it should
541  // create setcc with i1 operands.  We don't have instructions for i1 setcc.
542  if (VT == MVT::i1 && Op == ISD::SETCC)
543    return false;
544
545  return TargetLowering::isTypeDesirableForOp(Op, VT);
546}
547
548SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
549                                            const SDLoc &SL, SDValue Chain,
550                                            unsigned Offset) const {
551  const DataLayout &DL = DAG.getDataLayout();
552  MachineFunction &MF = DAG.getMachineFunction();
553  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
554  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
555
556  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
557  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
558  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
559                                       MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
560  return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
561                     DAG.getConstant(Offset, SL, PtrVT));
562}
563SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
564                                         const SDLoc &SL, SDValue Chain,
565                                         unsigned Offset, bool Signed) const {
566  const DataLayout &DL = DAG.getDataLayout();
567  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
568  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
569  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
570  SDValue PtrOffset = DAG.getUNDEF(PtrVT);
571  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
572
573  unsigned Align = DL.getABITypeAlignment(Ty);
574
575  ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
576  if (MemVT.isFloatingPoint())
577    ExtTy = ISD::EXTLOAD;
578
579  SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
580  return DAG.getLoad(ISD::UNINDEXED, ExtTy,
581                     VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
582                     false, // isVolatile
583                     true, // isNonTemporal
584                     true, // isInvariant
585                     Align); // Alignment
586}
587
588SDValue SITargetLowering::LowerFormalArguments(
589    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
590    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
591    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
592  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
593
594  MachineFunction &MF = DAG.getMachineFunction();
595  FunctionType *FType = MF.getFunction()->getFunctionType();
596  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
597  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
598
599  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
600    const Function *Fn = MF.getFunction();
601    DiagnosticInfoUnsupported NoGraphicsHSA(
602        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
603    DAG.getContext()->diagnose(NoGraphicsHSA);
604    return DAG.getEntryNode();
605  }
606
607  // Create stack objects that are used for emitting debugger prologue if
608  // "amdgpu-debugger-emit-prologue" attribute was specified.
609  if (ST.debuggerEmitPrologue())
610    createDebuggerPrologueStackObjects(MF);
611
612  SmallVector<ISD::InputArg, 16> Splits;
613  BitVector Skipped(Ins.size());
614
615  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
616    const ISD::InputArg &Arg = Ins[i];
617
618    // First check if it's a PS input addr
619    if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
620        !Arg.Flags.isByVal() && PSInputNum <= 15) {
621
622      if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
623        // We can safely skip PS inputs
624        Skipped.set(i);
625        ++PSInputNum;
626        continue;
627      }
628
629      Info->markPSInputAllocated(PSInputNum);
630      if (Arg.Used)
631        Info->PSInputEna |= 1 << PSInputNum;
632
633      ++PSInputNum;
634    }
635
636    if (AMDGPU::isShader(CallConv)) {
637      // Second split vertices into their elements
638      if (Arg.VT.isVector()) {
639        ISD::InputArg NewArg = Arg;
640        NewArg.Flags.setSplit();
641        NewArg.VT = Arg.VT.getVectorElementType();
642
643        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
644        // three or five element vertex only needs three or five registers,
645        // NOT four or eight.
646        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
647        unsigned NumElements = ParamType->getVectorNumElements();
648
649        for (unsigned j = 0; j != NumElements; ++j) {
650          Splits.push_back(NewArg);
651          NewArg.PartOffset += NewArg.VT.getStoreSize();
652        }
653      } else {
654        Splits.push_back(Arg);
655      }
656    }
657  }
658
659  SmallVector<CCValAssign, 16> ArgLocs;
660  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
661                 *DAG.getContext());
662
663  // At least one interpolation mode must be enabled or else the GPU will hang.
664  //
665  // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
666  // PSInputAddr, the user wants to enable some bits after the compilation
667  // based on run-time states. Since we can't know what the final PSInputEna
668  // will look like, so we shouldn't do anything here and the user should take
669  // responsibility for the correct programming.
670  //
671  // Otherwise, the following restrictions apply:
672  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
673  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
674  //   enabled too.
675  if (CallConv == CallingConv::AMDGPU_PS &&
676      ((Info->getPSInputAddr() & 0x7F) == 0 ||
677       ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
678    CCInfo.AllocateReg(AMDGPU::VGPR0);
679    CCInfo.AllocateReg(AMDGPU::VGPR1);
680    Info->markPSInputAllocated(0);
681    Info->PSInputEna |= 1;
682  }
683
684  if (!AMDGPU::isShader(CallConv)) {
685    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
686                            Splits);
687
688    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
689  } else {
690    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
691           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
692           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
693           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
694           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
695           !Info->hasWorkItemIDZ());
696  }
697
698  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
699  if (Info->hasPrivateSegmentBuffer()) {
700    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
701    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
702    CCInfo.AllocateReg(PrivateSegmentBufferReg);
703  }
704
705  if (Info->hasDispatchPtr()) {
706    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
707    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
708    CCInfo.AllocateReg(DispatchPtrReg);
709  }
710
711  if (Info->hasQueuePtr()) {
712    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
713    MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
714    CCInfo.AllocateReg(QueuePtrReg);
715  }
716
717  if (Info->hasKernargSegmentPtr()) {
718    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
719    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
720    CCInfo.AllocateReg(InputPtrReg);
721  }
722
723  if (Info->hasFlatScratchInit()) {
724    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
725    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
726    CCInfo.AllocateReg(FlatScratchInitReg);
727  }
728
729  AnalyzeFormalArguments(CCInfo, Splits);
730
731  SmallVector<SDValue, 16> Chains;
732
733  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
734
735    const ISD::InputArg &Arg = Ins[i];
736    if (Skipped[i]) {
737      InVals.push_back(DAG.getUNDEF(Arg.VT));
738      continue;
739    }
740
741    CCValAssign &VA = ArgLocs[ArgIdx++];
742    MVT VT = VA.getLocVT();
743
744    if (VA.isMemLoc()) {
745      VT = Ins[i].VT;
746      EVT MemVT = Splits[i].VT;
747      const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
748                              VA.getLocMemOffset();
749      // The first 36 bytes of the input buffer contains information about
750      // thread group and global sizes.
751      SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, Chain,
752                                   Offset, Ins[i].Flags.isSExt());
753      Chains.push_back(Arg.getValue(1));
754
755      auto *ParamTy =
756        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
757      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
758          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
759        // On SI local pointers are just offsets into LDS, so they are always
760        // less than 16-bits.  On CI and newer they could potentially be
761        // real pointers, so we can't guarantee their size.
762        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
763                          DAG.getValueType(MVT::i16));
764      }
765
766      InVals.push_back(Arg);
767      Info->ABIArgOffset = Offset + MemVT.getStoreSize();
768      continue;
769    }
770    assert(VA.isRegLoc() && "Parameter must be in a register!");
771
772    unsigned Reg = VA.getLocReg();
773
774    if (VT == MVT::i64) {
775      // For now assume it is a pointer
776      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
777                                     &AMDGPU::SReg_64RegClass);
778      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
779      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
780      InVals.push_back(Copy);
781      continue;
782    }
783
784    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
785
786    Reg = MF.addLiveIn(Reg, RC);
787    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
788
789    if (Arg.VT.isVector()) {
790
791      // Build a vector from the registers
792      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
793      unsigned NumElements = ParamType->getVectorNumElements();
794
795      SmallVector<SDValue, 4> Regs;
796      Regs.push_back(Val);
797      for (unsigned j = 1; j != NumElements; ++j) {
798        Reg = ArgLocs[ArgIdx++].getLocReg();
799        Reg = MF.addLiveIn(Reg, RC);
800
801        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
802        Regs.push_back(Copy);
803      }
804
805      // Fill up the missing vector elements
806      NumElements = Arg.VT.getVectorNumElements() - NumElements;
807      Regs.append(NumElements, DAG.getUNDEF(VT));
808
809      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
810      continue;
811    }
812
813    InVals.push_back(Val);
814  }
815
816  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
817  // these from the dispatch pointer.
818
819  // Start adding system SGPRs.
820  if (Info->hasWorkGroupIDX()) {
821    unsigned Reg = Info->addWorkGroupIDX();
822    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
823    CCInfo.AllocateReg(Reg);
824  }
825
826  if (Info->hasWorkGroupIDY()) {
827    unsigned Reg = Info->addWorkGroupIDY();
828    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
829    CCInfo.AllocateReg(Reg);
830  }
831
832  if (Info->hasWorkGroupIDZ()) {
833    unsigned Reg = Info->addWorkGroupIDZ();
834    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
835    CCInfo.AllocateReg(Reg);
836  }
837
838  if (Info->hasWorkGroupInfo()) {
839    unsigned Reg = Info->addWorkGroupInfo();
840    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
841    CCInfo.AllocateReg(Reg);
842  }
843
844  if (Info->hasPrivateSegmentWaveByteOffset()) {
845    // Scratch wave offset passed in system SGPR.
846    unsigned PrivateSegmentWaveByteOffsetReg;
847
848    if (AMDGPU::isShader(CallConv)) {
849      PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
850      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
851    } else
852      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
853
854    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
855    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
856  }
857
858  // Now that we've figured out where the scratch register inputs are, see if
859  // should reserve the arguments and use them directly.
860  bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
861  // Record that we know we have non-spill stack objects so we don't need to
862  // check all stack objects later.
863  if (HasStackObjects)
864    Info->setHasNonSpillStackObjects(true);
865
866  if (ST.isAmdHsaOS()) {
867    // TODO: Assume we will spill without optimizations.
868    if (HasStackObjects) {
869      // If we have stack objects, we unquestionably need the private buffer
870      // resource. For the HSA ABI, this will be the first 4 user SGPR
871      // inputs. We can reserve those and use them directly.
872
873      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
874        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
875      Info->setScratchRSrcReg(PrivateSegmentBufferReg);
876
877      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
878        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
879      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
880    } else {
881      unsigned ReservedBufferReg
882        = TRI->reservedPrivateSegmentBufferReg(MF);
883      unsigned ReservedOffsetReg
884        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
885
886      // We tentatively reserve the last registers (skipping the last two
887      // which may contain VCC). After register allocation, we'll replace
888      // these with the ones immediately after those which were really
889      // allocated. In the prologue copies will be inserted from the argument
890      // to these reserved registers.
891      Info->setScratchRSrcReg(ReservedBufferReg);
892      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
893    }
894  } else {
895    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
896
897    // Without HSA, relocations are used for the scratch pointer and the
898    // buffer resource setup is always inserted in the prologue. Scratch wave
899    // offset is still in an input SGPR.
900    Info->setScratchRSrcReg(ReservedBufferReg);
901
902    if (HasStackObjects) {
903      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
904        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
905      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
906    } else {
907      unsigned ReservedOffsetReg
908        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
909      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
910    }
911  }
912
913  if (Info->hasWorkItemIDX()) {
914    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
915    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
916    CCInfo.AllocateReg(Reg);
917  }
918
919  if (Info->hasWorkItemIDY()) {
920    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
921    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
922    CCInfo.AllocateReg(Reg);
923  }
924
925  if (Info->hasWorkItemIDZ()) {
926    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
927    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
928    CCInfo.AllocateReg(Reg);
929  }
930
931  if (Chains.empty())
932    return Chain;
933
934  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
935}
936
937SDValue
938SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
939                              bool isVarArg,
940                              const SmallVectorImpl<ISD::OutputArg> &Outs,
941                              const SmallVectorImpl<SDValue> &OutVals,
942                              const SDLoc &DL, SelectionDAG &DAG) const {
943  MachineFunction &MF = DAG.getMachineFunction();
944  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
945
946  if (!AMDGPU::isShader(CallConv))
947    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
948                                             OutVals, DL, DAG);
949
950  Info->setIfReturnsVoid(Outs.size() == 0);
951
952  SmallVector<ISD::OutputArg, 48> Splits;
953  SmallVector<SDValue, 48> SplitVals;
954
955  // Split vectors into their elements.
956  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
957    const ISD::OutputArg &Out = Outs[i];
958
959    if (Out.VT.isVector()) {
960      MVT VT = Out.VT.getVectorElementType();
961      ISD::OutputArg NewOut = Out;
962      NewOut.Flags.setSplit();
963      NewOut.VT = VT;
964
965      // We want the original number of vector elements here, e.g.
966      // three or five, not four or eight.
967      unsigned NumElements = Out.ArgVT.getVectorNumElements();
968
969      for (unsigned j = 0; j != NumElements; ++j) {
970        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
971                                   DAG.getConstant(j, DL, MVT::i32));
972        SplitVals.push_back(Elem);
973        Splits.push_back(NewOut);
974        NewOut.PartOffset += NewOut.VT.getStoreSize();
975      }
976    } else {
977      SplitVals.push_back(OutVals[i]);
978      Splits.push_back(Out);
979    }
980  }
981
982  // CCValAssign - represent the assignment of the return value to a location.
983  SmallVector<CCValAssign, 48> RVLocs;
984
985  // CCState - Info about the registers and stack slots.
986  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
987                 *DAG.getContext());
988
989  // Analyze outgoing return values.
990  AnalyzeReturn(CCInfo, Splits);
991
992  SDValue Flag;
993  SmallVector<SDValue, 48> RetOps;
994  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
995
996  // Copy the result values into the output registers.
997  for (unsigned i = 0, realRVLocIdx = 0;
998       i != RVLocs.size();
999       ++i, ++realRVLocIdx) {
1000    CCValAssign &VA = RVLocs[i];
1001    assert(VA.isRegLoc() && "Can only return in registers!");
1002
1003    SDValue Arg = SplitVals[realRVLocIdx];
1004
1005    // Copied from other backends.
1006    switch (VA.getLocInfo()) {
1007    default: llvm_unreachable("Unknown loc info!");
1008    case CCValAssign::Full:
1009      break;
1010    case CCValAssign::BCvt:
1011      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
1012      break;
1013    }
1014
1015    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
1016    Flag = Chain.getValue(1);
1017    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1018  }
1019
1020  // Update chain and glue.
1021  RetOps[0] = Chain;
1022  if (Flag.getNode())
1023    RetOps.push_back(Flag);
1024
1025  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
1026  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
1027}
1028
1029unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
1030                                             SelectionDAG &DAG) const {
1031  unsigned Reg = StringSwitch<unsigned>(RegName)
1032    .Case("m0", AMDGPU::M0)
1033    .Case("exec", AMDGPU::EXEC)
1034    .Case("exec_lo", AMDGPU::EXEC_LO)
1035    .Case("exec_hi", AMDGPU::EXEC_HI)
1036    .Case("flat_scratch", AMDGPU::FLAT_SCR)
1037    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
1038    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
1039    .Default(AMDGPU::NoRegister);
1040
1041  if (Reg == AMDGPU::NoRegister) {
1042    report_fatal_error(Twine("invalid register name \""
1043                             + StringRef(RegName)  + "\"."));
1044
1045  }
1046
1047  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
1048      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
1049    report_fatal_error(Twine("invalid register \""
1050                             + StringRef(RegName)  + "\" for subtarget."));
1051  }
1052
1053  switch (Reg) {
1054  case AMDGPU::M0:
1055  case AMDGPU::EXEC_LO:
1056  case AMDGPU::EXEC_HI:
1057  case AMDGPU::FLAT_SCR_LO:
1058  case AMDGPU::FLAT_SCR_HI:
1059    if (VT.getSizeInBits() == 32)
1060      return Reg;
1061    break;
1062  case AMDGPU::EXEC:
1063  case AMDGPU::FLAT_SCR:
1064    if (VT.getSizeInBits() == 64)
1065      return Reg;
1066    break;
1067  default:
1068    llvm_unreachable("missing register type checking");
1069  }
1070
1071  report_fatal_error(Twine("invalid type for register \""
1072                           + StringRef(RegName) + "\"."));
1073}
1074
1075// If kill is not the last instruction, split the block so kill is always a
1076// proper terminator.
1077MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
1078                                                    MachineBasicBlock *BB) const {
1079  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1080
1081  MachineBasicBlock::iterator SplitPoint(&MI);
1082  ++SplitPoint;
1083
1084  if (SplitPoint == BB->end()) {
1085    // Don't bother with a new block.
1086    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
1087    return BB;
1088  }
1089
1090  MachineFunction *MF = BB->getParent();
1091  MachineBasicBlock *SplitBB
1092    = MF->CreateMachineBasicBlock(BB->getBasicBlock());
1093
1094  SmallSet<unsigned, 8> SplitDefRegs;
1095  for (auto I = SplitPoint, E = BB->end(); I != E; ++I) {
1096    for (MachineOperand &Def : I->defs())
1097      SplitDefRegs.insert(Def.getReg());
1098  }
1099
1100  // Fix the block phi references to point to the new block for the defs in the
1101  // second piece of the block.
1102  for (MachineBasicBlock *Succ : BB->successors()) {
1103    for (MachineInstr &MI : *Succ) {
1104      if (!MI.isPHI())
1105        break;
1106
1107      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1108        unsigned IncomingReg = MI.getOperand(I).getReg();
1109        MachineOperand &FromBB = MI.getOperand(I + 1);
1110        if (BB == FromBB.getMBB()) {
1111          if (SplitDefRegs.count(IncomingReg))
1112            FromBB.setMBB(SplitBB);
1113
1114          break;
1115        }
1116      }
1117    }
1118  }
1119
1120  MF->insert(++MachineFunction::iterator(BB), SplitBB);
1121  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
1122
1123
1124  SplitBB->transferSuccessors(BB);
1125  BB->addSuccessor(SplitBB);
1126
1127  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
1128  return SplitBB;
1129}
1130
1131MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
1132  MachineInstr &MI, MachineBasicBlock *BB) const {
1133  switch (MI.getOpcode()) {
1134  case AMDGPU::SI_INIT_M0: {
1135    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1136    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
1137            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1138        .addOperand(MI.getOperand(0));
1139    MI.eraseFromParent();
1140    break;
1141  }
1142  case AMDGPU::BRANCH:
1143    return BB;
1144  case AMDGPU::GET_GROUPSTATICSIZE: {
1145    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1146
1147    MachineFunction *MF = BB->getParent();
1148    SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1149    DebugLoc DL = MI.getDebugLoc();
1150    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32))
1151        .addOperand(MI.getOperand(0))
1152        .addImm(MFI->LDSSize);
1153    MI.eraseFromParent();
1154    return BB;
1155  }
1156  case AMDGPU::SI_KILL:
1157    return splitKillBlock(MI, BB);
1158  default:
1159    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
1160  }
1161  return BB;
1162}
1163
1164bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
1165  // This currently forces unfolding various combinations of fsub into fma with
1166  // free fneg'd operands. As long as we have fast FMA (controlled by
1167  // isFMAFasterThanFMulAndFAdd), we should perform these.
1168
1169  // When fma is quarter rate, for f64 where add / sub are at best half rate,
1170  // most of these combines appear to be cycle neutral but save on instruction
1171  // count / code size.
1172  return true;
1173}
1174
1175EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
1176                                         EVT VT) const {
1177  if (!VT.isVector()) {
1178    return MVT::i1;
1179  }
1180  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
1181}
1182
1183MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
1184  return MVT::i32;
1185}
1186
1187// Answering this is somewhat tricky and depends on the specific device which
1188// have different rates for fma or all f64 operations.
1189//
1190// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
1191// regardless of which device (although the number of cycles differs between
1192// devices), so it is always profitable for f64.
1193//
1194// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
1195// only on full rate devices. Normally, we should prefer selecting v_mad_f32
1196// which we can always do even without fused FP ops since it returns the same
1197// result as the separate operations and since it is always full
1198// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
1199// however does not support denormals, so we do report fma as faster if we have
1200// a fast fma device and require denormals.
1201//
1202bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
1203  VT = VT.getScalarType();
1204
1205  if (!VT.isSimple())
1206    return false;
1207
1208  switch (VT.getSimpleVT().SimpleTy) {
1209  case MVT::f32:
1210    // This is as fast on some subtargets. However, we always have full rate f32
1211    // mad available which returns the same result as the separate operations
1212    // which we should prefer over fma. We can't use this if we want to support
1213    // denormals, so only report this in these cases.
1214    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
1215  case MVT::f64:
1216    return true;
1217  default:
1218    break;
1219  }
1220
1221  return false;
1222}
1223
1224//===----------------------------------------------------------------------===//
1225// Custom DAG Lowering Operations
1226//===----------------------------------------------------------------------===//
1227
1228SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1229  switch (Op.getOpcode()) {
1230  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
1231  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
1232  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
1233  case ISD::LOAD: {
1234    SDValue Result = LowerLOAD(Op, DAG);
1235    assert((!Result.getNode() ||
1236            Result.getNode()->getNumValues() == 2) &&
1237           "Load should return a value and a chain");
1238    return Result;
1239  }
1240
1241  case ISD::FSIN:
1242  case ISD::FCOS:
1243    return LowerTrig(Op, DAG);
1244  case ISD::SELECT: return LowerSELECT(Op, DAG);
1245  case ISD::FDIV: return LowerFDIV(Op, DAG);
1246  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
1247  case ISD::STORE: return LowerSTORE(Op, DAG);
1248  case ISD::GlobalAddress: {
1249    MachineFunction &MF = DAG.getMachineFunction();
1250    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1251    return LowerGlobalAddress(MFI, Op, DAG);
1252  }
1253  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
1254  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
1255  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
1256  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
1257  case ISD::TRAP: return lowerTRAP(Op, DAG);
1258  }
1259  return SDValue();
1260}
1261
1262/// \brief Helper function for LowerBRCOND
1263static SDNode *findUser(SDValue Value, unsigned Opcode) {
1264
1265  SDNode *Parent = Value.getNode();
1266  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
1267       I != E; ++I) {
1268
1269    if (I.getUse().get() != Value)
1270      continue;
1271
1272    if (I->getOpcode() == Opcode)
1273      return *I;
1274  }
1275  return nullptr;
1276}
1277
1278SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
1279
1280  SDLoc SL(Op);
1281  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
1282  unsigned FrameIndex = FINode->getIndex();
1283
1284  // A FrameIndex node represents a 32-bit offset into scratch memory. If the
1285  // high bit of a frame index offset were to be set, this would mean that it
1286  // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
1287  // buffer, with 64 being the number of threads per wave.
1288  //
1289  // The maximum private allocation for the entire GPU is 4G, and we are
1290  // concerned with the largest the index could ever be for an individual
1291  // workitem. This will occur with the minmum dispatch size. If a program
1292  // requires more, the dispatch size will be reduced.
1293  //
1294  // With this limit, we can mark the high bit of the FrameIndex node as known
1295  // zero, which is important, because it means in most situations we can prove
1296  // that values derived from FrameIndex nodes are non-negative. This enables us
1297  // to take advantage of more addressing modes when accessing scratch buffers,
1298  // since for scratch reads/writes, the register offset must always be
1299  // positive.
1300
1301  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
1302
1303  // XXX - It is unclear if partial dispatch works. Assume it works at half wave
1304  // granularity. It is probably a full wave.
1305  uint64_t MinGranularity = 32;
1306
1307  unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
1308  EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
1309
1310  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
1311  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
1312                     DAG.getValueType(ExtVT));
1313}
1314
1315bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
1316  if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
1317    return false;
1318
1319  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
1320  default: return false;
1321  case AMDGPUIntrinsic::amdgcn_if:
1322  case AMDGPUIntrinsic::amdgcn_else:
1323  case AMDGPUIntrinsic::amdgcn_break:
1324  case AMDGPUIntrinsic::amdgcn_if_break:
1325  case AMDGPUIntrinsic::amdgcn_else_break:
1326  case AMDGPUIntrinsic::amdgcn_loop:
1327  case AMDGPUIntrinsic::amdgcn_end_cf:
1328    return true;
1329  }
1330}
1331
1332void SITargetLowering::createDebuggerPrologueStackObjects(
1333    MachineFunction &MF) const {
1334  // Create stack objects that are used for emitting debugger prologue.
1335  //
1336  // Debugger prologue writes work group IDs and work item IDs to scratch memory
1337  // at fixed location in the following format:
1338  //   offset 0:  work group ID x
1339  //   offset 4:  work group ID y
1340  //   offset 8:  work group ID z
1341  //   offset 16: work item ID x
1342  //   offset 20: work item ID y
1343  //   offset 24: work item ID z
1344  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1345  int ObjectIdx = 0;
1346
1347  // For each dimension:
1348  for (unsigned i = 0; i < 3; ++i) {
1349    // Create fixed stack object for work group ID.
1350    ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true);
1351    Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
1352    // Create fixed stack object for work item ID.
1353    ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true);
1354    Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
1355  }
1356}
1357
1358/// This transforms the control flow intrinsics to get the branch destination as
1359/// last parameter, also switches branch target with BR if the need arise
1360SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
1361                                      SelectionDAG &DAG) const {
1362
1363  SDLoc DL(BRCOND);
1364
1365  SDNode *Intr = BRCOND.getOperand(1).getNode();
1366  SDValue Target = BRCOND.getOperand(2);
1367  SDNode *BR = nullptr;
1368  SDNode *SetCC = nullptr;
1369
1370  if (Intr->getOpcode() == ISD::SETCC) {
1371    // As long as we negate the condition everything is fine
1372    SetCC = Intr;
1373    Intr = SetCC->getOperand(0).getNode();
1374
1375  } else {
1376    // Get the target from BR if we don't negate the condition
1377    BR = findUser(BRCOND, ISD::BR);
1378    Target = BR->getOperand(1);
1379  }
1380
1381  if (!isCFIntrinsic(Intr)) {
1382    // This is a uniform branch so we don't need to legalize.
1383    return BRCOND;
1384  }
1385
1386  assert(!SetCC ||
1387        (SetCC->getConstantOperandVal(1) == 1 &&
1388         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
1389                                                             ISD::SETNE));
1390
1391  // Build the result and
1392  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
1393
1394  // operands of the new intrinsic call
1395  SmallVector<SDValue, 4> Ops;
1396  Ops.push_back(BRCOND.getOperand(0));
1397  Ops.append(Intr->op_begin() + 1, Intr->op_end());
1398  Ops.push_back(Target);
1399
1400  // build the new intrinsic call
1401  SDNode *Result = DAG.getNode(
1402    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
1403    DAG.getVTList(Res), Ops).getNode();
1404
1405  if (BR) {
1406    // Give the branch instruction our target
1407    SDValue Ops[] = {
1408      BR->getOperand(0),
1409      BRCOND.getOperand(2)
1410    };
1411    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
1412    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
1413    BR = NewBR.getNode();
1414  }
1415
1416  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
1417
1418  // Copy the intrinsic results to registers
1419  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
1420    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
1421    if (!CopyToReg)
1422      continue;
1423
1424    Chain = DAG.getCopyToReg(
1425      Chain, DL,
1426      CopyToReg->getOperand(1),
1427      SDValue(Result, i - 1),
1428      SDValue());
1429
1430    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
1431  }
1432
1433  // Remove the old intrinsic from the chain
1434  DAG.ReplaceAllUsesOfValueWith(
1435    SDValue(Intr, Intr->getNumValues() - 1),
1436    Intr->getOperand(0));
1437
1438  return Chain;
1439}
1440
1441SDValue SITargetLowering::getSegmentAperture(unsigned AS,
1442                                             SelectionDAG &DAG) const {
1443  SDLoc SL;
1444  MachineFunction &MF = DAG.getMachineFunction();
1445  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1446  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
1447  assert(UserSGPR != AMDGPU::NoRegister);
1448
1449  SDValue QueuePtr = CreateLiveInRegister(
1450    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
1451
1452  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1453  // private_segment_aperture_base_hi.
1454  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1455
1456  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
1457                            DAG.getConstant(StructOffset, SL, MVT::i64));
1458
1459  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might
  // not be available here, and it is unclear how we would get it.
1462  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
1463                                              AMDGPUAS::CONSTANT_ADDRESS));
1464
1465  MachinePointerInfo PtrInfo(V, StructOffset);
1466  return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr,
1467                     PtrInfo, false,
1468                     false, true,
1469                     MinAlign(64, StructOffset));
1470}
1471
1472SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
1473                                             SelectionDAG &DAG) const {
1474  SDLoc SL(Op);
1475  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
1476
1477  SDValue Src = ASC->getOperand(0);
1478
1479  // FIXME: Really support non-0 null pointers.
1480  SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
1481  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
1482
1483  // flat -> local/private
1484  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
1485    if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1486        ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1487      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
1488      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
1489
1490      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
1491                         NonNull, Ptr, SegmentNullPtr);
1492    }
1493  }
1494
1495  // local/private -> flat
1496  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
1497    if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1498        ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1499      SDValue NonNull
1500        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
1501
1502      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
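      // Pack the 32-bit segment offset into the low dword and the aperture
      // into the high dword; the bitcast below yields the 64-bit flat pointer.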
1503      SDValue CvtPtr
1504        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
1505
1506      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
1507                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
1508                         FlatNullPtr);
1509    }
1510  }
1511
1512  // global <-> flat are no-ops and never emitted.
1513
1514  const MachineFunction &MF = DAG.getMachineFunction();
1515  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
1516    *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
1517  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
1518
1519  return DAG.getUNDEF(ASC->getValueType(0));
1520}
1521
1522static bool shouldEmitGOTReloc(const GlobalValue *GV,
1523                               const TargetMachine &TM) {
1524  return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
1525         !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
1526}
1527
1528bool
1529SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
1530  // We can fold offsets for anything that doesn't require a GOT relocation.
1531  return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
1532         !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine());
1533}
1534
1535static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
1536                                      SDLoc DL, unsigned Offset, EVT PtrVT,
1537                                      unsigned GAFlags = SIInstrInfo::MO_NONE) {
1538  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
1539  // lowered to the following code sequence:
1540  // s_getpc_b64 s[0:1]
1541  // s_add_u32 s0, s0, $symbol
1542  // s_addc_u32 s1, s1, 0
1543  //
1544  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1545  // a fixup or relocation is emitted to replace $symbol with a literal
1546  // constant, which is a pc-relative offset from the encoding of the $symbol
1547  // operand to the global variable.
1548  //
1549  // What we want here is an offset from the value returned by s_getpc
1550  // (which is the address of the s_add_u32 instruction) to the global
1551  // variable, but since the encoding of $symbol starts 4 bytes after the start
1552  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1553  // small. This requires us to add 4 to the global variable offset in order to
1554  // compute the correct address.
1555  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
1556                                          GAFlags);
1557  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA);
1558}
1559
1560SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
1561                                             SDValue Op,
1562                                             SelectionDAG &DAG) const {
1563  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
1564
1565  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
1566      GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
1567    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
1568
1569  SDLoc DL(GSD);
1570  const GlobalValue *GV = GSD->getGlobal();
1571  EVT PtrVT = Op.getValueType();
1572
1573  if (!shouldEmitGOTReloc(GV, getTargetMachine()))
1574    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
1575
1576  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
1577                                            SIInstrInfo::MO_GOTPCREL);
1578
1579  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
1580  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
1581  const DataLayout &DataLayout = DAG.getDataLayout();
1582  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
1583  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
1584  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1585
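  // The GOT entry holds the absolute address of the global, so the final
  // address is obtained by loading through the pc-relative GOT address
  // computed above.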
1586  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr,
1587                     PtrInfo, false, false, true, Align);
1588}
1589
1590SDValue SITargetLowering::lowerTRAP(SDValue Op,
1591                                    SelectionDAG &DAG) const {
1592  const MachineFunction &MF = DAG.getMachineFunction();
1593  DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
1594                                   "trap handler not supported",
1595                                   Op.getDebugLoc(),
1596                                   DS_Warning);
1597  DAG.getContext()->diagnose(NoTrap);
1598
1599  // Emit s_endpgm.
1600
1601  // FIXME: This should really be selected to s_trap, but that requires
  // setting up the trap handler for it to do anything.
1603  return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
1604                     Op.getOperand(0));
1605}
1606
1607SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
1608                                   const SDLoc &DL, SDValue V) const {
1609  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
1610  // the destination register.
1611  //
1612  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
1613  // so we will end up with redundant moves to m0.
1614  //
1615  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
1616
1617  // A Null SDValue creates a glue result.
1618  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
1619                                  V, Chain);
1620  return SDValue(M0, 0);
1621}
1622
1623SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
1624                                                 SDValue Op,
1625                                                 MVT VT,
1626                                                 unsigned Offset) const {
1627  SDLoc SL(Op);
1628  SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
1629                                 DAG.getEntryNode(), Offset, false);
1630  // The local size values will have the hi 16-bits as zero.
1631  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
1632                     DAG.getValueType(VT));
1633}
1634
1635static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
1636  DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
1637                                      "non-hsa intrinsic with hsa target",
1638                                      DL.getDebugLoc());
1639  DAG.getContext()->diagnose(BadIntrin);
1640  return DAG.getUNDEF(VT);
1641}
1642
1643static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
1644  DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
1645                                      "intrinsic not supported on subtarget",
1646                                      DL.getDebugLoc());
1647  DAG.getContext()->diagnose(BadIntrin);
1648  return DAG.getUNDEF(VT);
1649}
1650
1651SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1652                                                  SelectionDAG &DAG) const {
1653  MachineFunction &MF = DAG.getMachineFunction();
1654  auto MFI = MF.getInfo<SIMachineFunctionInfo>();
1655  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1656
1657  EVT VT = Op.getValueType();
1658  SDLoc DL(Op);
1659  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1660
1661  // TODO: Should this propagate fast-math-flags?
1662
1663  switch (IntrinsicID) {
1664  case Intrinsic::amdgcn_dispatch_ptr:
1665  case Intrinsic::amdgcn_queue_ptr: {
1666    if (!Subtarget->isAmdHsaOS()) {
1667      DiagnosticInfoUnsupported BadIntrin(
1668          *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
1669          DL.getDebugLoc());
1670      DAG.getContext()->diagnose(BadIntrin);
1671      return DAG.getUNDEF(VT);
1672    }
1673
1674    auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
1675      SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
1676    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
1677                                TRI->getPreloadedValue(MF, Reg), VT);
1678  }
1679  case Intrinsic::amdgcn_implicitarg_ptr: {
1680    unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
1681    return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
1682  }
1683  case Intrinsic::amdgcn_kernarg_segment_ptr: {
1684    unsigned Reg
1685      = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
1686    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
1687  }
1688  case Intrinsic::amdgcn_rcp:
1689    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
1690  case Intrinsic::amdgcn_rsq:
1691  case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
1692    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
1693  case Intrinsic::amdgcn_rsq_legacy: {
1694    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
1695      return emitRemovedIntrinsicError(DAG, DL, VT);
1696
1697    return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
1698  }
1699  case Intrinsic::amdgcn_rsq_clamp:
1700  case AMDGPUIntrinsic::AMDGPU_rsq_clamped: { // Legacy name
1701    if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
1702      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
1703
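    // There is no RSQ_CLAMP selection on VI+, so emulate the clamp by
    // limiting the plain RSQ result to [-max_float, +max_float] with
    // fminnum/fmaxnum.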
1704    Type *Type = VT.getTypeForEVT(*DAG.getContext());
1705    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
1706    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
1707
1708    SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
1709    SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
1710                              DAG.getConstantFP(Max, DL, VT));
1711    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
1712                       DAG.getConstantFP(Min, DL, VT));
1713  }
1714  case Intrinsic::r600_read_ngroups_x:
1715    if (Subtarget->isAmdHsaOS())
1716      return emitNonHSAIntrinsicError(DAG, DL, VT);
1717
1718    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1719                          SI::KernelInputOffsets::NGROUPS_X, false);
1720  case Intrinsic::r600_read_ngroups_y:
1721    if (Subtarget->isAmdHsaOS())
1722      return emitNonHSAIntrinsicError(DAG, DL, VT);
1723
1724    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1725                          SI::KernelInputOffsets::NGROUPS_Y, false);
1726  case Intrinsic::r600_read_ngroups_z:
1727    if (Subtarget->isAmdHsaOS())
1728      return emitNonHSAIntrinsicError(DAG, DL, VT);
1729
1730    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1731                          SI::KernelInputOffsets::NGROUPS_Z, false);
1732  case Intrinsic::r600_read_global_size_x:
1733    if (Subtarget->isAmdHsaOS())
1734      return emitNonHSAIntrinsicError(DAG, DL, VT);
1735
1736    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1737                          SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
1738  case Intrinsic::r600_read_global_size_y:
1739    if (Subtarget->isAmdHsaOS())
1740      return emitNonHSAIntrinsicError(DAG, DL, VT);
1741
1742    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1743                          SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
1744  case Intrinsic::r600_read_global_size_z:
1745    if (Subtarget->isAmdHsaOS())
1746      return emitNonHSAIntrinsicError(DAG, DL, VT);
1747
1748    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1749                          SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
1750  case Intrinsic::r600_read_local_size_x:
1751    if (Subtarget->isAmdHsaOS())
1752      return emitNonHSAIntrinsicError(DAG, DL, VT);
1753
1754    return lowerImplicitZextParam(DAG, Op, MVT::i16,
1755                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
1756  case Intrinsic::r600_read_local_size_y:
1757    if (Subtarget->isAmdHsaOS())
1758      return emitNonHSAIntrinsicError(DAG, DL, VT);
1759
1760    return lowerImplicitZextParam(DAG, Op, MVT::i16,
1761                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
1762  case Intrinsic::r600_read_local_size_z:
1763    if (Subtarget->isAmdHsaOS())
1764      return emitNonHSAIntrinsicError(DAG, DL, VT);
1765
1766    return lowerImplicitZextParam(DAG, Op, MVT::i16,
1767                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
1768  case Intrinsic::amdgcn_read_workdim:
1769  case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name.
1770    // Really only 2 bits.
1771    return lowerImplicitZextParam(DAG, Op, MVT::i8,
1772                                  getImplicitParameterOffset(MFI, GRID_DIM));
1773  case Intrinsic::amdgcn_workgroup_id_x:
1774  case Intrinsic::r600_read_tgid_x:
1775    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
1776      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
1777  case Intrinsic::amdgcn_workgroup_id_y:
1778  case Intrinsic::r600_read_tgid_y:
1779    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
1780      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
1781  case Intrinsic::amdgcn_workgroup_id_z:
1782  case Intrinsic::r600_read_tgid_z:
1783    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
1784      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
1785  case Intrinsic::amdgcn_workitem_id_x:
1786  case Intrinsic::r600_read_tidig_x:
1787    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
1788      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
1789  case Intrinsic::amdgcn_workitem_id_y:
1790  case Intrinsic::r600_read_tidig_y:
1791    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
1792      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
1793  case Intrinsic::amdgcn_workitem_id_z:
1794  case Intrinsic::r600_read_tidig_z:
1795    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
1796      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
1797  case AMDGPUIntrinsic::SI_load_const: {
1798    SDValue Ops[] = {
1799      Op.getOperand(1),
1800      Op.getOperand(2)
1801    };
1802
1803    MachineMemOperand *MMO = MF.getMachineMemOperand(
1804      MachinePointerInfo(),
1805      MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
1806      VT.getStoreSize(), 4);
1807    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
1808                                   Op->getVTList(), Ops, VT, MMO);
1809  }
1810  case AMDGPUIntrinsic::SI_vs_load_input:
1811    return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
1812                       Op.getOperand(1),
1813                       Op.getOperand(2),
1814                       Op.getOperand(3));
1815
1816  case AMDGPUIntrinsic::SI_fs_constant: {
1817    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
1818    SDValue Glue = M0.getValue(1);
1819    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
1820                       DAG.getConstant(2, DL, MVT::i32), // P0
1821                       Op.getOperand(1), Op.getOperand(2), Glue);
1822  }
1823  case AMDGPUIntrinsic::SI_packf16:
1824    if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
1825      return DAG.getUNDEF(MVT::i32);
1826    return Op;
1827  case AMDGPUIntrinsic::SI_fs_interp: {
1828    SDValue IJ = Op.getOperand(4);
1829    SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
1830                            DAG.getConstant(0, DL, MVT::i32));
1831    SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
1832                            DAG.getConstant(1, DL, MVT::i32));
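    // Interpolation runs in two steps: INTERP_P1 applies the I coordinate and
    // INTERP_P2 completes the result with J, with m0 initialized first via
    // copyToM0 and glue chaining the three nodes together.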
1833    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
1834    SDValue Glue = M0.getValue(1);
1835    SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
1836                             DAG.getVTList(MVT::f32, MVT::Glue),
1837                             I, Op.getOperand(1), Op.getOperand(2), Glue);
1838    Glue = SDValue(P1.getNode(), 1);
1839    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
1840                             Op.getOperand(1), Op.getOperand(2), Glue);
1841  }
1842  case Intrinsic::amdgcn_interp_p1: {
1843    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
1844    SDValue Glue = M0.getValue(1);
1845    return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
1846                       Op.getOperand(2), Op.getOperand(3), Glue);
1847  }
1848  case Intrinsic::amdgcn_interp_p2: {
1849    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
1850    SDValue Glue = SDValue(M0.getNode(), 1);
1851    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
1852                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
1853                       Glue);
1854  }
1855  case Intrinsic::amdgcn_sin:
1856    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
1857
1858  case Intrinsic::amdgcn_cos:
1859    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
1860
1861  case Intrinsic::amdgcn_log_clamp: {
1862    if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
1863      return SDValue();
1864
1865    DiagnosticInfoUnsupported BadIntrin(
1866      *MF.getFunction(), "intrinsic not supported on subtarget",
1867      DL.getDebugLoc());
    DAG.getContext()->diagnose(BadIntrin);
    return DAG.getUNDEF(VT);
1870  }
1871  case Intrinsic::amdgcn_ldexp:
1872    return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
1873                       Op.getOperand(1), Op.getOperand(2));
1874
1875  case Intrinsic::amdgcn_fract:
1876    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
1877
1878  case Intrinsic::amdgcn_class:
1879    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
1880                       Op.getOperand(1), Op.getOperand(2));
1881  case Intrinsic::amdgcn_div_fmas:
1882    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
1883                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
1884                       Op.getOperand(4));
1885
1886  case Intrinsic::amdgcn_div_fixup:
1887    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
1888                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
1889
1890  case Intrinsic::amdgcn_trig_preop:
1891    return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
1892                       Op.getOperand(1), Op.getOperand(2));
1893  case Intrinsic::amdgcn_div_scale: {
    // The 3rd operand is required to be a constant.
1895    const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
1896    if (!Param)
1897      return DAG.getUNDEF(VT);
1898
    // Translate to the operands expected by the machine instruction. The
    // first source operand must be either the numerator or the denominator,
    // selected by the constant third operand.
1901    SDValue Numerator = Op.getOperand(1);
1902    SDValue Denominator = Op.getOperand(2);
1903
    // Note this order is the opposite of the machine instruction's operands,
    // which are s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
1906    // intrinsic has the numerator as the first operand to match a normal
1907    // division operation.
1908
1909    SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
1910
1911    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
1912                       Denominator, Numerator);
1913  }
1914  default:
1915    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
1916  }
1917}
1918
1919SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
1920                                                 SelectionDAG &DAG) const {
1921  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1922  switch (IntrID) {
1923  case Intrinsic::amdgcn_atomic_inc:
1924  case Intrinsic::amdgcn_atomic_dec: {
1925    MemSDNode *M = cast<MemSDNode>(Op);
1926    unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
1927      AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
1928    SDValue Ops[] = {
1929      M->getOperand(0), // Chain
1930      M->getOperand(2), // Ptr
1931      M->getOperand(3)  // Value
1932    };
1933
1934    return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
1935                                   M->getMemoryVT(), M->getMemOperand());
1936  }
1937  default:
1938    return SDValue();
1939  }
1940}
1941
1942SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1943                                              SelectionDAG &DAG) const {
1944  MachineFunction &MF = DAG.getMachineFunction();
1945  SDLoc DL(Op);
1946  SDValue Chain = Op.getOperand(0);
1947  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1948
1949  switch (IntrinsicID) {
1950  case AMDGPUIntrinsic::SI_sendmsg: {
1951    Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
1952    SDValue Glue = Chain.getValue(1);
1953    return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
1954                       Op.getOperand(2), Glue);
1955  }
1956  case AMDGPUIntrinsic::SI_tbuffer_store: {
1957    SDValue Ops[] = {
1958      Chain,
1959      Op.getOperand(2),
1960      Op.getOperand(3),
1961      Op.getOperand(4),
1962      Op.getOperand(5),
1963      Op.getOperand(6),
1964      Op.getOperand(7),
1965      Op.getOperand(8),
1966      Op.getOperand(9),
1967      Op.getOperand(10),
1968      Op.getOperand(11),
1969      Op.getOperand(12),
1970      Op.getOperand(13),
1971      Op.getOperand(14)
1972    };
1973
1974    EVT VT = Op.getOperand(3).getValueType();
1975
1976    MachineMemOperand *MMO = MF.getMachineMemOperand(
1977      MachinePointerInfo(),
1978      MachineMemOperand::MOStore,
1979      VT.getStoreSize(), 4);
1980    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
1981                                   Op->getVTList(), Ops, VT, MMO);
1982  }
1983  case AMDGPUIntrinsic::AMDGPU_kill: {
1984    if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) {
1985      if (!K->isNegative())
1986        return Chain;
1987    }
1988
1989    return Op;
1990  }
1991  default:
1992    return SDValue();
1993  }
1994}
1995
1996SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1997  SDLoc DL(Op);
1998  LoadSDNode *Load = cast<LoadSDNode>(Op);
1999  ISD::LoadExtType ExtType = Load->getExtensionType();
2000  EVT MemVT = Load->getMemoryVT();
2001
2002  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
2003    assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
2004    // FIXME: Copied from PPC
2005    // First, load into 32 bits, then truncate to 1 bit.
2006
2007    SDValue Chain = Load->getChain();
2008    SDValue BasePtr = Load->getBasePtr();
2009    MachineMemOperand *MMO = Load->getMemOperand();
2010
2011    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
2012                                   BasePtr, MVT::i8, MMO);
2013
2014    SDValue Ops[] = {
2015      DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
2016      NewLD.getValue(1)
2017    };
2018
2019    return DAG.getMergeValues(Ops, DL);
2020  }
2021
2022  if (!MemVT.isVector())
2023    return SDValue();
2024
2025  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
2026         "Custom lowering for non-i32 vectors hasn't been implemented.");
2027
2028  unsigned AS = Load->getAddressSpace();
2029  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
2030                          AS, Load->getAlignment())) {
2031    SDValue Ops[2];
2032    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2033    return DAG.getMergeValues(Ops, DL);
2034  }
2035
2036  unsigned NumElements = MemVT.getVectorNumElements();
2037  switch (AS) {
2038  case AMDGPUAS::CONSTANT_ADDRESS:
2039    if (isMemOpUniform(Load))
2040      return SDValue();
2041    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
2043    // loads.
2044    //
2045    // Fall-through
2046  case AMDGPUAS::GLOBAL_ADDRESS:
2047  case AMDGPUAS::FLAT_ADDRESS:
2048    if (NumElements > 4)
2049      return SplitVectorLoad(Op, DAG);
2050    // v4 loads are supported for private and global memory.
2051    return SDValue();
2052  case AMDGPUAS::PRIVATE_ADDRESS: {
2053    // Depending on the setting of the private_element_size field in the
2054    // resource descriptor, we can only make private accesses up to a certain
2055    // size.
2056    switch (Subtarget->getMaxPrivateElementSize()) {
2057    case 4:
2058      return scalarizeVectorLoad(Load, DAG);
2059    case 8:
2060      if (NumElements > 2)
2061        return SplitVectorLoad(Op, DAG);
2062      return SDValue();
2063    case 16:
2064      // Same as global/flat
2065      if (NumElements > 4)
2066        return SplitVectorLoad(Op, DAG);
2067      return SDValue();
2068    default:
2069      llvm_unreachable("unsupported private_element_size");
2070    }
2071  }
2072  case AMDGPUAS::LOCAL_ADDRESS: {
2073    if (NumElements > 2)
2074      return SplitVectorLoad(Op, DAG);
2075
2076    if (NumElements == 2)
2077      return SDValue();
2078
    // If properly aligned, splitting might let us use ds_read_b64.
2080    return SplitVectorLoad(Op, DAG);
2081  }
2082  default:
2083    return SDValue();
2084  }
2085}
2086
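// Split a 64-bit select into two 32-bit selects on the low and high halves,
// extracted through a v2i32 bitcast and reassembled with a build_vector.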
2087SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
2088  if (Op.getValueType() != MVT::i64)
2089    return SDValue();
2090
2091  SDLoc DL(Op);
2092  SDValue Cond = Op.getOperand(0);
2093
2094  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2095  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2096
2097  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
2098  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
2099
2100  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
2101  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
2102
2103  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
2104
2105  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
2106  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
2107
2108  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
2109
2110  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
2111  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
2112}
2113
2114// Catch division cases where we can use shortcuts with rcp and rsq
2115// instructions.
2116SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
2117  SDLoc SL(Op);
2118  SDValue LHS = Op.getOperand(0);
2119  SDValue RHS = Op.getOperand(1);
2120  EVT VT = Op.getValueType();
2121  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
2122
2123  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
2124    if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
2125        CLHS->isExactlyValue(1.0)) {
2126      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
2127      // the CI documentation has a worst case error of 1 ulp.
2128      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
2129      // use it as long as we aren't trying to use denormals.
2130
2131      // 1.0 / sqrt(x) -> rsq(x)
2132      //
2133      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
2134      // error seems really high at 2^29 ULP.
2135      if (RHS.getOpcode() == ISD::FSQRT)
2136        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
2137
2138      // 1.0 / x -> rcp(x)
2139      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
2140    }
2141  }
2142
2143  const SDNodeFlags *Flags = Op->getFlags();
2144
2145  if (Unsafe || Flags->hasAllowReciprocal()) {
2146    // Turn into multiply by the reciprocal.
2147    // x / y -> x * (1.0 / y)
2148    SDNodeFlags Flags;
2149    Flags.setUnsafeAlgebra(true);
2150    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
2151    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
2152  }
2153
2154  return SDValue();
2155}
2156
2157SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
2158  if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
2159    return FastLowered;
2160
2161  SDLoc SL(Op);
2162  SDValue LHS = Op.getOperand(0);
2163  SDValue RHS = Op.getOperand(1);
2164
2165  // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag
2166  if (EnableAMDGPUFastFDIV) {
2167    // This does not support denormals.
2168    SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
2169
2170    const APFloat K0Val(BitsToFloat(0x6f800000));
2171    const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
2172
2173    const APFloat K1Val(BitsToFloat(0x2f800000));
2174    const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
2175
2176    const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
2177
2178    EVT SetCCVT =
2179        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
2180
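    // K0 is 2^+96 and K1 is 2^-32 as float bit patterns. If |RHS| exceeds
    // 2^96, pre-scale the denominator by 2^-32 so its reciprocal stays out of
    // the denormal range (which rcp does not support), then rescale the
    // quotient by the same factor at the end.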
2181    SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
2182
2183    SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
2184
2185    // TODO: Should this propagate fast-math-flags?
2186
2187    r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
2188
2189    // rcp does not support denormals.
2190    SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
2191
2192    SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
2193
2194    return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
2195  }
2196
2197  // Generates more precise fpdiv32.
2198  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
2199
2200  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
2201
2202  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
2203  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
2204
2205  // Denominator is scaled to not be denormal, so using rcp is ok.
2206  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
2207
2208  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
2209
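  // Refine the approximate reciprocal and quotient with FMA steps; the final
  // residual feeds DIV_FMAS for the last correction:
  //   Fma0 = 1 - den * rcp        (reciprocal error)
  //   Fma1 = rcp + Fma0 * rcp     (refined reciprocal)
  //   Mul  = num * Fma1           (initial quotient)
  //   Fma2 = num - den * Mul      (quotient residual)
  //   Fma3 = Mul + Fma2 * Fma1    (refined quotient)
  //   Fma4 = num - den * Fma3     (final residual)
  // where num and den are the scaled numerator and denominator.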
2210  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
2211  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
2212
2213  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
2214
2215  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
2216  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
2217  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
2218
2219  SDValue Scale = NumeratorScaled.getValue(1);
2220  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
2221
2222  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
2223}
2224
2225SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
2226  if (DAG.getTarget().Options.UnsafeFPMath)
2227    return LowerFastFDIV(Op, DAG);
2228
2229  SDLoc SL(Op);
2230  SDValue X = Op.getOperand(0);
2231  SDValue Y = Op.getOperand(1);
2232
2233  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2234
2235  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
2236
2237  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
2238
2239  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
2240
2241  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
2242
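  // Two Newton-Raphson iterations (Fma0..Fma3) refine the reciprocal of the
  // scaled denominator; Mul then forms the quotient and Fma4 its residual for
  // the final DIV_FMAS/DIV_FIXUP steps.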
2243  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
2244
2245  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
2246
2247  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
2248
2249  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
2250
2251  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
2252  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
2253
2254  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
2255                             NegDivScale0, Mul, DivScale1);
2256
2257  SDValue Scale;
2258
2259  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
2260    // Workaround a hardware bug on SI where the condition output from div_scale
2261    // is not usable.
2262
2263    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
2264
    // Figure out which scale to use for div_fmas.
2266    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
2267    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
2268    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
2269    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
2270
2271    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
2272    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
2273
2274    SDValue Scale0Hi
2275      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
2276    SDValue Scale1Hi
2277      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
2278
2279    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
2280    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
2281    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
2282  } else {
2283    Scale = DivScale1.getValue(1);
2284  }
2285
2286  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
2287                             Fma4, Fma3, Mul, Scale);
2288
2289  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
2290}
2291
2292SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
2293  EVT VT = Op.getValueType();
2294
2295  if (VT == MVT::f32)
2296    return LowerFDIV32(Op, DAG);
2297
2298  if (VT == MVT::f64)
2299    return LowerFDIV64(Op, DAG);
2300
2301  llvm_unreachable("Unexpected type for fdiv");
2302}
2303
2304SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2305  SDLoc DL(Op);
2306  StoreSDNode *Store = cast<StoreSDNode>(Op);
2307  EVT VT = Store->getMemoryVT();
2308
2309  if (VT == MVT::i1) {
2310    return DAG.getTruncStore(Store->getChain(), DL,
2311       DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
2312       Store->getBasePtr(), MVT::i1, Store->getMemOperand());
2313  }
2314
2315  assert(VT.isVector() &&
2316         Store->getValue().getValueType().getScalarType() == MVT::i32);
2317
2318  unsigned AS = Store->getAddressSpace();
2319  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
2320                          AS, Store->getAlignment())) {
2321    return expandUnalignedStore(Store, DAG);
2322  }
2323
2324  unsigned NumElements = VT.getVectorNumElements();
2325  switch (AS) {
2326  case AMDGPUAS::GLOBAL_ADDRESS:
2327  case AMDGPUAS::FLAT_ADDRESS:
2328    if (NumElements > 4)
2329      return SplitVectorStore(Op, DAG);
2330    return SDValue();
2331  case AMDGPUAS::PRIVATE_ADDRESS: {
2332    switch (Subtarget->getMaxPrivateElementSize()) {
2333    case 4:
2334      return scalarizeVectorStore(Store, DAG);
2335    case 8:
2336      if (NumElements > 2)
2337        return SplitVectorStore(Op, DAG);
2338      return SDValue();
2339    case 16:
2340      if (NumElements > 4)
2341        return SplitVectorStore(Op, DAG);
2342      return SDValue();
2343    default:
2344      llvm_unreachable("unsupported private_element_size");
2345    }
2346  }
2347  case AMDGPUAS::LOCAL_ADDRESS: {
2348    if (NumElements > 2)
2349      return SplitVectorStore(Op, DAG);
2350
2351    if (NumElements == 2)
2352      return Op;
2353
    // If properly aligned, splitting might let us use ds_write_b64.
2355    return SplitVectorStore(Op, DAG);
2356  }
2357  default:
2358    llvm_unreachable("unhandled address space");
2359  }
2360}
2361
2362SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
2363  SDLoc DL(Op);
2364  EVT VT = Op.getValueType();
2365  SDValue Arg = Op.getOperand(0);
2366  // TODO: Should this propagate fast-math-flags?
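  // The hardware sin/cos take their argument in revolutions rather than
  // radians, so scale by 1/(2*pi) and use FRACT for the range reduction.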
2367  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
2368                                  DAG.getNode(ISD::FMUL, DL, VT, Arg,
2369                                              DAG.getConstantFP(0.5/M_PI, DL,
2370                                                                VT)));
2371
2372  switch (Op.getOpcode()) {
2373  case ISD::FCOS:
2374    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
2375  case ISD::FSIN:
2376    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
2377  default:
2378    llvm_unreachable("Wrong trig opcode");
2379  }
2380}
2381
2382SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
2383  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
2384  assert(AtomicNode->isCompareAndSwap());
2385  unsigned AS = AtomicNode->getAddressSpace();
2386
2387  // No custom lowering required for local address space
2388  if (!isFlatGlobalAddrSpace(AS))
2389    return Op;
2390
  // Non-local address spaces require custom lowering for atomic compare and
  // swap; the swap and compare values are packed into a v2i32 (v2i64 for _X2).
2393  SDLoc DL(Op);
2394  SDValue ChainIn = Op.getOperand(0);
2395  SDValue Addr = Op.getOperand(1);
2396  SDValue Old = Op.getOperand(2);
2397  SDValue New = Op.getOperand(3);
2398  EVT VT = Op.getValueType();
2399  MVT SimpleVT = VT.getSimpleVT();
2400  MVT VecType = MVT::getVectorVT(SimpleVT, 2);
2401
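  // Pack the swap (new) value first and the compare (old) value second.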
2402  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
2403  SDValue Ops[] = { ChainIn, Addr, NewOld };
2404
2405  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
2406                                 Ops, VT, AtomicNode->getMemOperand());
2407}
2408
2409//===----------------------------------------------------------------------===//
2410// Custom DAG optimizations
2411//===----------------------------------------------------------------------===//
2412
2413SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
2414                                                     DAGCombinerInfo &DCI) const {
2415  EVT VT = N->getValueType(0);
2416  EVT ScalarVT = VT.getScalarType();
2417  if (ScalarVT != MVT::f32)
2418    return SDValue();
2419
2420  SelectionDAG &DAG = DCI.DAG;
2421  SDLoc DL(N);
2422
2423  SDValue Src = N->getOperand(0);
2424  EVT SrcVT = Src.getValueType();
2425
2426  // TODO: We could try to match extracting the higher bytes, which would be
2427  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
2428  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
2429  // about in practice.
2430  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
2431    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
2432      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
2433      DCI.AddToWorklist(Cvt.getNode());
2434      return Cvt;
2435    }
2436  }
2437
2438  return SDValue();
2439}
2440
2441/// \brief Return true if the given offset Size in bytes can be folded into
2442/// the immediate offsets of a memory instruction for the given address space.
2443static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
2444                          const SISubtarget &STI) {
2445  switch (AS) {
2446  case AMDGPUAS::GLOBAL_ADDRESS: {
    // MUBUF instructions have a 12-bit offset in bytes.
2448    return isUInt<12>(OffsetSize);
2449  }
2450  case AMDGPUAS::CONSTANT_ADDRESS: {
2451    // SMRD instructions have an 8-bit offset in dwords on SI and
2452    // a 20-bit offset in bytes on VI.
2453    if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
2454      return isUInt<20>(OffsetSize);
2455    else
2456      return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
2457  }
2458  case AMDGPUAS::LOCAL_ADDRESS:
2459  case AMDGPUAS::REGION_ADDRESS: {
2460    // The single offset versions have a 16-bit offset in bytes.
2461    return isUInt<16>(OffsetSize);
2462  }
2463  case AMDGPUAS::PRIVATE_ADDRESS:
2464  // Indirect register addressing does not use any offsets.
2465  default:
    return false;
2467  }
2468}
2469
2470// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
2471
2472// This is a variant of
2473// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
2474//
2475// The normal DAG combiner will do this, but only if the add has one use since
2476// that would increase the number of instructions.
2477//
2478// This prevents us from seeing a constant offset that can be folded into a
2479// memory instruction's addressing mode. If we know the resulting add offset of
2480// a pointer can be folded into an addressing offset, we can replace the pointer
2481// operand with the add of new constant offset. This eliminates one of the uses,
2482// and may allow the remaining use to also be simplified.
2483//
2484SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
2485                                               unsigned AddrSpace,
2486                                               DAGCombinerInfo &DCI) const {
2487  SDValue N0 = N->getOperand(0);
2488  SDValue N1 = N->getOperand(1);
2489
2490  if (N0.getOpcode() != ISD::ADD)
2491    return SDValue();
2492
2493  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
2494  if (!CN1)
2495    return SDValue();
2496
2497  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
2498  if (!CAdd)
2499    return SDValue();
2500
2501  // If the resulting offset is too large, we can't fold it into the addressing
2502  // mode offset.
2503  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
2504  if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
2505    return SDValue();
2506
2507  SelectionDAG &DAG = DCI.DAG;
2508  SDLoc SL(N);
2509  EVT VT = N->getValueType(0);
2510
2511  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
2512  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
2513
2514  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
2515}
2516
2517SDValue SITargetLowering::performAndCombine(SDNode *N,
2518                                            DAGCombinerInfo &DCI) const {
2519  if (DCI.isBeforeLegalize())
2520    return SDValue();
2521
2522  if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI))
2523    return Base;
2524
2525  SelectionDAG &DAG = DCI.DAG;
2526
2527  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
2528  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
2529  SDValue LHS = N->getOperand(0);
2530  SDValue RHS = N->getOperand(1);
2531
2532  if (LHS.getOpcode() == ISD::SETCC &&
2533      RHS.getOpcode() == ISD::SETCC) {
2534    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
2535    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
2536
2537    SDValue X = LHS.getOperand(0);
2538    SDValue Y = RHS.getOperand(0);
2539    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
2540      return SDValue();
2541
2542    if (LCC == ISD::SETO) {
2543      if (X != LHS.getOperand(1))
2544        return SDValue();
2545
2546      if (RCC == ISD::SETUNE) {
2547        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
2548        if (!C1 || !C1->isInfinity() || C1->isNegative())
2549          return SDValue();
2550
2551        const uint32_t Mask = SIInstrFlags::N_NORMAL |
2552                              SIInstrFlags::N_SUBNORMAL |
2553                              SIInstrFlags::N_ZERO |
2554                              SIInstrFlags::P_ZERO |
2555                              SIInstrFlags::P_SUBNORMAL |
2556                              SIInstrFlags::P_NORMAL;
2557
2558        static_assert(((~(SIInstrFlags::S_NAN |
2559                          SIInstrFlags::Q_NAN |
2560                          SIInstrFlags::N_INFINITY |
2561                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
2562                      "mask not equal");
2563
2564        SDLoc DL(N);
2565        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
2566                           X, DAG.getConstant(Mask, DL, MVT::i32));
2567      }
2568    }
2569  }
2570
2571  return SDValue();
2572}
2573
2574SDValue SITargetLowering::performOrCombine(SDNode *N,
2575                                           DAGCombinerInfo &DCI) const {
2576  SelectionDAG &DAG = DCI.DAG;
2577  SDValue LHS = N->getOperand(0);
2578  SDValue RHS = N->getOperand(1);
2579
2580  EVT VT = N->getValueType(0);
2581  if (VT == MVT::i64) {
    // TODO: This could be a generic combine with a predicate for whether
    // extracting the high half of an integer is free.
2584
2585    // (or i64:x, (zero_extend i32:y)) ->
2586    //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
2587    if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
2588        RHS.getOpcode() != ISD::ZERO_EXTEND)
2589      std::swap(LHS, RHS);
2590
2591    if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
2592      SDValue ExtSrc = RHS.getOperand(0);
2593      EVT SrcVT = ExtSrc.getValueType();
2594      if (SrcVT == MVT::i32) {
2595        SDLoc SL(N);
2596        SDValue LowLHS, HiBits;
2597        std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
2598        SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
2599
2600        DCI.AddToWorklist(LowOr.getNode());
2601        DCI.AddToWorklist(HiBits.getNode());
2602
2603        SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
2604                                  LowOr, HiBits);
2605        return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2606      }
2607    }
2608  }
2609
2610  // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
2611  if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
2612      RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
2613    SDValue Src = LHS.getOperand(0);
2614    if (Src != RHS.getOperand(0))
2615      return SDValue();
2616
2617    const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
2618    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
2619    if (!CLHS || !CRHS)
2620      return SDValue();
2621
2622    // Only 10 bits are used.
2623    static const uint32_t MaxMask = 0x3ff;
2624
2625    uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
2626    SDLoc DL(N);
2627    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
2628                       Src, DAG.getConstant(NewMask, DL, MVT::i32));
2629  }
2630
2631  return SDValue();
2632}
2633
2634SDValue SITargetLowering::performClassCombine(SDNode *N,
2635                                              DAGCombinerInfo &DCI) const {
2636  SelectionDAG &DAG = DCI.DAG;
2637  SDValue Mask = N->getOperand(1);
2638
2639  // fp_class x, 0 -> false
2640  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
2641    if (CMask->isNullValue())
2642      return DAG.getConstant(0, SDLoc(N), MVT::i1);
2643  }
2644
2645  if (N->getOperand(0).isUndef())
2646    return DAG.getUNDEF(MVT::i1);
2647
2648  return SDValue();
2649}
2650
2651// Constant fold canonicalize.
2652SDValue SITargetLowering::performFCanonicalizeCombine(
2653  SDNode *N,
2654  DAGCombinerInfo &DCI) const {
2655  ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
2656  if (!CFP)
2657    return SDValue();
2658
2659  SelectionDAG &DAG = DCI.DAG;
2660  const APFloat &C = CFP->getValueAPF();
2661
2662  // Flush denormals to 0 if not enabled.
2663  if (C.isDenormal()) {
2664    EVT VT = N->getValueType(0);
2665    if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
2666      return DAG.getConstantFP(0.0, SDLoc(N), VT);
2667
2668    if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
2669      return DAG.getConstantFP(0.0, SDLoc(N), VT);
2670  }
2671
2672  if (C.isNaN()) {
2673    EVT VT = N->getValueType(0);
2674    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
2675    if (C.isSignaling()) {
2676      // Quiet a signaling NaN.
2677      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
2678    }
2679
2680    // Make sure it is the canonical NaN bitpattern.
2681    //
2682    // TODO: Can we use -1 as the canonical NaN value since it's an inline
2683    // immediate?
2684    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
2685      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
2686  }
2687
2688  return SDValue(CFP, 0);
2689}
2690
2691static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
2692  switch (Opc) {
2693  case ISD::FMAXNUM:
2694    return AMDGPUISD::FMAX3;
2695  case ISD::SMAX:
2696    return AMDGPUISD::SMAX3;
2697  case ISD::UMAX:
2698    return AMDGPUISD::UMAX3;
2699  case ISD::FMINNUM:
2700    return AMDGPUISD::FMIN3;
2701  case ISD::SMIN:
2702    return AMDGPUISD::SMIN3;
2703  case ISD::UMIN:
2704    return AMDGPUISD::UMIN3;
2705  default:
2706    llvm_unreachable("Not a min/max opcode");
2707  }
2708}
2709
2710static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
2711                                        SDValue Op0, SDValue Op1, bool Signed) {
2712  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
2713  if (!K1)
2714    return SDValue();
2715
2716  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
2717  if (!K0)
2718    return SDValue();
2719
2720  if (Signed) {
2721    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
2722      return SDValue();
2723  } else {
2724    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
2725      return SDValue();
2726  }
2727
2728  EVT VT = K0->getValueType(0);
2729  return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
2730                     Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
2731}
2732
2733static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
2734  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
2735    return true;
2736
2737  return DAG.isKnownNeverNaN(Op);
2738}
2739
2740static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
2741                                       SDValue Op0, SDValue Op1) {
2742  ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
2743  if (!K1)
2744    return SDValue();
2745
2746  ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
2747  if (!K0)
2748    return SDValue();
2749
2750  // Ordered >= (although NaN inputs should have folded away by now).
2751  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
2752  if (Cmp == APFloat::cmpGreaterThan)
2753    return SDValue();
2754
2755  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
2756  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
2757  // give the other result, which is different from med3 with a NaN input.
2758  SDValue Var = Op0.getOperand(0);
2759  if (!isKnownNeverSNan(DAG, Var))
2760    return SDValue();
2761
2762  return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
2763                     Var, SDValue(K0, 0), SDValue(K1, 0));
2764}
2765
2766SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
2767                                               DAGCombinerInfo &DCI) const {
2768  SelectionDAG &DAG = DCI.DAG;
2769
2770  unsigned Opc = N->getOpcode();
2771  SDValue Op0 = N->getOperand(0);
2772  SDValue Op1 = N->getOperand(1);
2773
  // Only do this if the inner op has one use, since this would otherwise just
  // increase register pressure for no benefit.
2776
2777  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
2778    // max(max(a, b), c) -> max3(a, b, c)
2779    // min(min(a, b), c) -> min3(a, b, c)
2780    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
2781      SDLoc DL(N);
2782      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
2783                         DL,
2784                         N->getValueType(0),
2785                         Op0.getOperand(0),
2786                         Op0.getOperand(1),
2787                         Op1);
2788    }
2789
2790    // Try commuted.
2791    // max(a, max(b, c)) -> max3(a, b, c)
2792    // min(a, min(b, c)) -> min3(a, b, c)
2793    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
2794      SDLoc DL(N);
2795      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
2796                         DL,
2797                         N->getValueType(0),
2798                         Op0,
2799                         Op1.getOperand(0),
2800                         Op1.getOperand(1));
2801    }
2802  }
2803
2804  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
2805  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
2806    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
2807      return Med3;
2808  }
2809
2810  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
2811    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
2812      return Med3;
2813  }
2814
2815  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  return SDValue();
}

SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();

  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // Match isinf pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
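  // e.g. an isinf(x) call that reaches the DAG as (setcc oeq (fabs x), +inf)
  // becomes a single class test, which typically selects to
  // v_cmp_class_f32/f64.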
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(Mask, SL, MVT::i32));
    }
  }

  return SDValue();
}

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
        N->getValueType(0) != MVT::f64 &&
        getTargetMachine().getOptLevel() > CodeGenOpt::None)
      return performMinMaxCombine(N, DCI);
    break;
  }

  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3: {
    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
    SDValue Src = N->getOperand(0);

    // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
    if (Src.getOpcode() == ISD::SRL) {
      // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
      // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
      // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
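      //
      // In general the new byte index is Offset + ShiftAmount / 8; e.g.
      // cvt_f32_ubyte1 (srl x, 16) reads bits [24:31] of x, i.e. ubyte3.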

      if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
        unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
        if (SrcOffset < 32 && SrcOffset % 8 == 0) {
          return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL,
                             MVT::f32, Src.getOperand(0));
        }
      }
    }

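    // Only the byte selected by this cvt variant is demanded from the source,
    // so try to simplify the source based on just those 8 bits.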
    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);
    }

    break;
  }

  case ISD::UINT_TO_FP: {
    return performUCharToFloatCombine(N, DCI);
  }
  case ISD::FADD: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);
    if (VT != MVT::f32)
      break;

    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
    if (Subtarget->hasFP32Denormals())
      break;

    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);

    // These should really be instruction patterns, but writing patterns with
    // source modifiers is a pain.

    // fadd (fadd (a, a), b) -> mad 2.0, a, b
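    // e.g. (x + x) + y is rebuilt as fmad(2.0, x, y), which typically selects
    // to v_mad_f32 (or v_mac_f32).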
    if (LHS.getOpcode() == ISD::FADD) {
      SDValue A = LHS.getOperand(0);
      if (A == LHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
      }
    }

    // fadd (b, fadd (a, a)) -> mad 2.0, a, b
    if (RHS.getOpcode() == ISD::FADD) {
      SDValue A = RHS.getOperand(0);
      if (A == RHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
      }
    }

    return SDValue();
  }
  case ISD::FSUB: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);

    // Try to get the fneg to fold into the source modifier. This undoes generic
    // DAG combines and folds them into the mad.
    //
    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
    if (VT == MVT::f32 &&
        !Subtarget->hasFP32Denormals()) {
      SDValue LHS = N->getOperand(0);
      SDValue RHS = N->getOperand(1);
      if (LHS.getOpcode() == ISD::FADD) {
        // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)

        SDValue A = LHS.getOperand(0);
        if (A == LHS.getOperand(1)) {
          const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
          SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);

          return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
        }
      }

      if (RHS.getOpcode() == ISD::FADD) {
        // (fsub c, (fadd a, a)) -> mad -2.0, a, c

        SDValue A = RHS.getOperand(0);
        if (A == RHS.getOperand(1)) {
          const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32);
          return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
        }
      }

      return SDValue();
    }

    break;
  }
  case ISD::LOAD:
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case AMDGPUISD::ATOMIC_INC:
  case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
    if (DCI.isBeforeLegalize())
      break;

    MemSDNode *MemNode = cast<MemSDNode>(N);
    SDValue Ptr = MemNode->getBasePtr();

    // TODO: We could also do this for multiplies.
    unsigned AS = MemNode->getAddressSpace();
    if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
      if (NewPtr) {
        SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end());

        NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
        return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
      }
    }
    break;
  }
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::LDEXP: {
    SDValue Src = N->getOperand(0);
    if (Src.isUndef())
      return Src;
    break;
  }
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate,
/// and the immediate value if it's a literal immediate.
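///
/// For example, on SI an i32 constant such as 4 fits in an inline immediate
/// (result 0), while a value like 0x12345678 needs a literal dword and that
/// value is returned instead.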
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
    if (TII->isInlineConstant(Node->getAPIntValue()))
      return 0;

    uint64_t Val = Node->getZExtValue();
    return isUInt<32>(Val) ? Val : -1;
  }

  if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
    if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
      return 0;

    if (Node->getValueType(0) == MVT::f32)
      return FloatToBits(Node->getValueAPF().convertToFloat());

    return -1;
  }

  return -1;
}

/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}

/// \brief Adjust the writemask of MIMG instructions
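///
/// For example, if only two components of an image sample result are ever
/// extracted, the dmask is shrunk to the two used bits and the EXTRACT_SUBREG
/// users are remapped onto the now-packed sub0/sub1 lanes.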
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Lane = 0;
  unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Set which texture component corresponds to the lane.
    unsigned Comp;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      assert(Dmask);
      Comp = countTrailingZeros(Dmask);
      Dmask &= ~(1 << Comp);
    }

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    NewDmask |= 1 << Comp;
  }

  // Abort if there's no change
  if (NewDmask == OldDmask)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);

  // If we only got one lane, replace it with a copy
  // (if NewDmask has only one bit set...)
  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
                                       MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}

static bool isFrameIndexOp(SDValue Op) {
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}

/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
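///
/// For example, a REG_SEQUENCE fed by a raw frame index gets that operand
/// wrapped in an S_MOV_B32 so the operand it actually sees is a register.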
void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                     Node->getOperand(i).getValueType(),
                                     Node->getOperand(i)), 0));
  }

  DAG.UpdateNodeOperands(Node, Ops);
}

/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode))
    adjustWritemask(Node, DAG);

  if (Opcode == AMDGPU::INSERT_SUBREG ||
      Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }
  return Node;
}

/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);
    return;
  }

  if (TII->isMIMG(MI)) {
    unsigned VReg = MI.getOperand(0).getReg();
    unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
    unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
    unsigned BitsSet = 0;
    for (unsigned i = 0; i < 4; ++i)
      BitsSet += Writemask & (1 << i) ? 1 : 0;

    const TargetRegisterClass *RC;
    switch (BitsSet) {
    default: return;
    case 1:  RC = &AMDGPU::VGPR_32RegClass; break;
    case 2:  RC = &AMDGPU::VReg_64RegClass; break;
    case 3:  RC = &AMDGPU::VReg_96RegClass; break;
    }

    unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
    MI.setDesc(TII->get(NewOpcode));
    MRI.setRegClass(VReg, RC);
    return;
  }

  // Replace unused atomics with the no return version.
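  // e.g. a buffer atomic whose result is never read is switched to the
  // no-return form, so the instruction no longer defines a result register.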
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);
      return;
    }

    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    // instruction, because the return type of these instructions is a vec2 of
    // the memory type, so it can be tied to the input operand.
    // This means these instructions always have a use, so we need to add a
    // special case to check if the atomic has only one extract_subreg use,
    // which itself has no uses.
    if ((Node->hasNUsesOfValue(1, 0) &&
         Node->use_begin()->isMachineOpcode() &&
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
         !Node->use_begin()->hasAnyUseOfValue(0))) {
      unsigned Def = MI.getOperand(0).getReg();

      // Change this into a noret atomic.
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);

      // If we only remove the def operand from the atomic instruction, the
      // extract_subreg will be left with a use of a vreg without a def.
      // So we need to insert an implicit_def to avoid machine verifier
      // errors.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::IMPLICIT_DEF), Def);
    }
    return;
  }
}

static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Val) {
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}

MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}

/// \brief Return a resource descriptor with the 'Add TID' bit enabled
///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
///        of the resource descriptor) to create an offset, which is added to
///        the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                     DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                    0);
  }

  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}

SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}

//===----------------------------------------------------------------------===//
//                         SI Inline Assembly Support
//===----------------------------------------------------------------------===//

std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {

  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 's':
    case 'r':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
        return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
      case 64:
        return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
      case 128:
        return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
      case 256:
        return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
      }

    case 'v':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
        return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
      case 64:
        return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
      case 96:
        return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
      case 128:
        return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
      case 256:
        return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
      case 512:
        return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
      }
    }
  }

  if (Constraint.size() > 1) {
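    // Constraints that name a single register (a 'v' or 's' register followed
    // by its index) are resolved against the corresponding 32-bit register
    // class; anything unrecognized falls through to the generic handling
    // below.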
    const TargetRegisterClass *RC = nullptr;
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
      if (!Failed && Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
