AMDGPUISelLowering.cpp revision cd81d94322a39503e4a3e87b6ee03d4fcb3465fb
//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"

using namespace llvm;

namespace {

/// Diagnostic information for unimplemented or unsupported feature reporting.
class DiagnosticInfoUnsupported : public DiagnosticInfo {
private:
  const Twine &Description;
  const Function &Fn;

  static int KindID;

  static int getKindID() {
    if (KindID == 0)
      KindID = llvm::getNextAvailablePluginDiagnosticKind();
    return KindID;
  }

public:
  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
                            DiagnosticSeverity Severity = DS_Error)
    : DiagnosticInfo(getKindID(), Severity),
      Description(Desc),
      Fn(Fn) { }

  const Function &getFunction() const { return Fn; }
  const Twine &getDescription() const { return Description; }

  void print(DiagnosticPrinter &DP) const override {
    DP << "unsupported " << getDescription() << " in " << Fn.getName();
  }

  static bool classof(const DiagnosticInfo *DI) {
    return DI->getKind() == getKindID();
  }
};

int DiagnosticInfoUnsupported::KindID = 0;
}


static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
                          CCValAssign::LocInfo LocInfo,
                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
  unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
                                        ArgFlags.getOrigAlign());
  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));

  return true;
}

#include "AMDGPUGenCallingConv.inc"

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
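
// For example, getEquivalentMemType maps a 64-bit v4i16 to v2i32 and a
// 16-bit v2i8 to i16; any type wider than 32 bits must be a whole multiple
// of 32 bits, which the assertion above enforces.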

// Type for the 32-bit register(s) a vector will be loaded into.
EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, 32);

  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
  TargetLowering(TM, new TargetLoweringObjectFileELF()) {

  Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();

  setOperationAction(ISD::Constant, MVT::i32, Legal);
  setOperationAction(ISD::Constant, MVT::i64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  // We need to custom lower some of the intrinsics
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Library functions.  These default to Expand, but we have instructions
  // for them.
  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
  setOperationAction(ISD::FABS,   MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
  setOperationAction(ISD::FROUND, MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);

  // Custom lowering of vector stores is required for local address space
  // stores.
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  // XXX: Native v2i32 local address space stores are possible, but not
  // currently implemented.
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);

  // XXX: This can be changed to Custom, once ExpandVectorStores can
  // handle 64-bit stores.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);

  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);


  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);

  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);

  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);

  setOperationAction(ISD::BR_CC, MVT::i1, Expand);

  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);

    // The GPU has no divrem instruction for signed or unsigned operands.
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);

    // The GPU does not have [S|U]MUL_LOHI as a single instruction.
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  static const MVT::SimpleValueType VectorIntTypes[] = {
    MVT::v2i32, MVT::v4i32
  };

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction(ISD::ADD,  VT, Expand);
    setOperationAction(ISD::AND,  VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::MUL,  VT, Expand);
    setOperationAction(ISD::OR,   VT, Expand);
    setOperationAction(ISD::SHL,  VT, Expand);
    setOperationAction(ISD::SRA,  VT, Expand);
    setOperationAction(ISD::SRL,  VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::SUB,  VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    // TODO: Implement custom UREM / SREM routines.
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::XOR,  VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
    MVT::v2f32, MVT::v4f32
  };

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
  }

  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SELECT_CC);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  setSelectIsExpensive(false);
  PredictableSelectIsExpensive = false;

  // There are no integer divide instructions, and these expand to a pretty
  // large sequence of instructions.
  setIntDivIsCheap(false);
  setPow2DivIsCheap(false);

  // TODO: Investigate this when 64-bit divides are implemented.
  addBypassSlowDiv(64, 32);

  // FIXME: Need to really handle these.
  MaxStoresPerMemcpy  = 4096;
  MaxStoresPerMemmove = 4096;
  MaxStoresPerMemset  = 4096;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

MVT AMDGPUTargetLowering::getVectorIdxTy() const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
                                                   EVT CastTy) const {
  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
    return true;

  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();

  return ((LScalarSize <= CastScalarSize) ||
          (CastScalarSize >= 32) ||
          (LScalarSize < 32));
}
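
// Note: the combined condition above only reports a bitcast as not
// beneficial when a load of 32-bit (or wider) scalars would be cast to
// sub-32-bit scalars, e.g. bitcasting an i64 load to v8i8. Every other
// same-sized cast is considered beneficial.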

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32;
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.
  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.
  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
         (Dest->getPrimitiveSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  const DataLayout *DL = getDataLayout();
  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra mov of 0 needed to produce the 64-bit
  // value is free. As used, this enables reducing 64-bit operations to
  // 32-bit ones, which is always good.
  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
                             const SmallVectorImpl<ISD::InputArg> &Ins) const {

  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}

SDValue AMDGPUTargetLowering::LowerReturn(
                                     SDValue Chain,
                                     CallingConv::ID CallConv,
                                     bool isVarArg,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     SDLoc DL, SelectionDAG &DAG) const {
  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = *DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
  DAG.getContext()->diagnose(NoCalls);
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op.getNode()->dump();
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::SDIV: return LowerSDIV(Op, DAG);
  case ISD::SREM: return LowerSREM(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    if (!Node)
      return;

    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
    if (Lowered.getNode())
      Results.push_back(Lowered);
    return;
  }
  default:
    return;
  }
}

// FIXME: This implements accesses to initialized globals in the constant
// address space by copying them to private and accessing that. It does not
// properly handle illegal types or vectors. The private vector loads are not
// scalarized, and the illegal scalars hit an assertion. This technique will not
// work well with large initializers, and this should eventually be
// removed. Initialized globals should be placed into a data section that the
// runtime will load into a buffer before the kernel is executed. Uses of the
// global need to be replaced with a pointer loaded from an implicit kernel
// argument into this buffer holding the copy of the data, which will remove the
// need for any of this.
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                       const GlobalValue *GV,
                                                       const SDValue &InitPtr,
                                                       SDValue Chain,
                                                       SelectionDAG &DAG) const {
  const DataLayout *TD = getTargetMachine().getDataLayout();
  SDLoc DL(InitPtr);
  Type *InitTy = Init->getType();

  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
    EVT VT = EVT::getEVT(InitTy);
    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                        TD->getPrefTypeAlignment(InitTy));
  }

  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
    EVT VT = EVT::getEVT(CFP->getType());
    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                 TD->getPrefTypeAlignment(CFP->getType()));
  }

  if (StructType *ST = dyn_cast<StructType>(InitTy)) {
    const StructLayout *SL = TD->getStructLayout(ST);

    EVT PtrVT = InitPtr.getValueType();
    SmallVector<SDValue, 8> Chains;

    for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
      SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);

      Constant *Elt = Init->getAggregateElement(I);
      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    }

    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
    EVT PtrVT = InitPtr.getValueType();

    unsigned NumElements;
    if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
      NumElements = AT->getNumElements();
    else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
      NumElements = VT->getNumElements();
    else
      llvm_unreachable("Unexpected type");

    unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
    SmallVector<SDValue, 8> Chains;
    for (unsigned i = 0; i < NumElements; ++i) {
      SDValue Offset = DAG.getConstant(i * EltSize, PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);

      Constant *Elt = Init->getAggregateElement(i);
      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    }

    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (isa<UndefValue>(Init)) {
    EVT VT = EVT::getEVT(InitTy);
    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
    return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                        TD->getPrefTypeAlignment(InitTy));
  }

  Init->dump();
  llvm_unreachable("Unhandled constant initializer");
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout *TD = getTargetMachine().getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  switch (G->getAddressSpace()) {
  default: llvm_unreachable("Global Address lowering not implemented for this "
                            "address space");
  case AMDGPUAS::LOCAL_ADDRESS: {
    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    unsigned Offset;
    if (MFI->LocalMemoryObjects.count(GV) == 0) {
      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
      Offset = MFI->LDSSize;
      MFI->LocalMemoryObjects[GV] = Offset;
      // XXX: Account for alignment?
      MFI->LDSSize += Size;
    } else {
      Offset = MFI->LocalMemoryObjects[GV];
    }

    return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
    Type *EltType = GV->getType()->getElementType();
    unsigned Size = TD->getTypeAllocSize(EltType);
    unsigned Alignment = TD->getPrefTypeAlignment(EltType);

    MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
    MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);

    int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
    SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);

    const GlobalVariable *Var = cast<GlobalVariable>(GV);
    if (!Var->hasInitializer()) {
      // This has no use, but bugpoint will hit it.
      return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
    }

    const Constant *Init = Var->getInitializer();
    SmallVector<SDNode*, 8> WorkList;

    for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
                              E = DAG.getEntryNode()->use_end(); I != E; ++I) {
      if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD &&
          I->getOpcode() != ISD::LOAD)
        continue;
      WorkList.push_back(*I);
    }
    SDValue Chain =
        LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
    for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
                                           E = WorkList.end(); I != E; ++I) {
      SmallVector<SDValue, 8> Ops;
      Ops.push_back(Chain);
      for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
        Ops.push_back((*I)->getOperand(i));
      }
      DAG.UpdateNodeOperands(*I, Ops);
    }
    return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
  }
  }
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  DAG.ExtractVectorElements(A, Args);
  DAG.ExtractVectorElements(B, Args);

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}
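
// The concat is lowered by fully scalarizing both operands and rebuilding
// the result, so concatenating two v2i32 values becomes four
// EXTRACT_VECTOR_ELT nodes feeding a single v4i32 BUILD_VECTOR.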

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Args;
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}

SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
                                              SelectionDAG &DAG) const {

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
                         Op.getValueType());
}

SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    SelectionDAG &DAG) const {
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  switch (IntrinsicID) {
    default: return Op;
    case AMDGPUIntrinsic::AMDGPU_abs:
    case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
      return LowerIntrinsicIABS(Op, DAG);
    case AMDGPUIntrinsic::AMDGPU_lrp:
      return LowerIntrinsicLRP(Op, DAG);
    case AMDGPUIntrinsic::AMDGPU_fract:
    case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_clamp:
    case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_div_scale: {
      // 3rd parameter required to be a constant.
      const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
      if (!Param)
        return DAG.getUNDEF(VT);

      // Translate to the operands expected by the machine instruction: the
      // first operand is the numerator when the constant parameter is true,
      // and the denominator otherwise.
      SDValue Numerator = Op.getOperand(1);
      SDValue Denominator = Op.getOperand(2);
      SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
                         Src0, Denominator, Numerator);
    }

    case Intrinsic::AMDGPU_div_fmas:
      return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_div_fixup:
      return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_trig_preop:
      return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case Intrinsic::AMDGPU_rcp:
      return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));

    case Intrinsic::AMDGPU_rsq:
      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));

    case Intrinsic::AMDGPU_rsq_clamped:
      return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_imax:
      return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_umax:
      return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_imin:
      return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_umin:
      return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_umul24:
      return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_imul24:
      return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_umad24:
      return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_imad24:
      return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfi:
      return DAG.getNode(AMDGPUISD::BFI, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfm:
      return DAG.getNode(AMDGPUISD::BFM, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_brev:
      return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
      return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
      return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
    case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
      return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
  }
}

/// IABS(a) = SMAX(sub(0, a), a)
SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
                                              Op.getOperand(1));

  return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
}

/// Linear Interpolation
/// LRP(a, b, c) = muladd(a,  b, (1 - a) * c)
SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
                                DAG.getConstantFP(1.0f, MVT::f32),
                                Op.getOperand(1));
  SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
                                                    Op.getOperand(3));
  return DAG.getNode(ISD::FADD, DL, VT,
      DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
      OneSubAC);
}

/// \brief Generate Min/Max node
SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
                                            SelectionDAG &DAG) const {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue True = N->getOperand(2);
  SDValue False = N->getOperand(3);
  SDValue CC = N->getOperand(4);

  if (VT != MVT::f32 ||
      !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
    return SDValue();
  }

  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    llvm_unreachable("Operation should already be optimised!");
  case ISD::SETULE:
  case ISD::SETULT:
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX;
    return DAG.getNode(Opc, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETUGE:
  case ISD::SETOGE:
  case ISD::SETUGT:
  case ISD::SETOGT: {
    unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN;
    return DAG.getNode(Opc, DL, VT, LHS, RHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
                                              SelectionDAG &DAG) const {
  // The result is dereferenced unconditionally, so use cast<> rather than
  // dyn_cast<>.
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
  EVT LoadVT = Op.getValueType();
  EVT EltVT = Op.getValueType().getVectorElementType();
  EVT PtrVT = Load->getBasePtr().getValueType();

  unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
  SmallVector<SDValue, 8> Loads;
  SmallVector<SDValue, 8> Chains;

  SDLoc SL(Op);

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
                    DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));

    SDValue NewLoad
      = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
                       Load->getChain(), Ptr,
                       MachinePointerInfo(Load->getMemOperand()->getValue()),
                       MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
                       Load->getAlignment());
    Loads.push_back(NewLoad.getValue(0));
    Chains.push_back(NewLoad.getValue(1));
  }

  SDValue Ops[] = {
    DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
    DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
  };

  return DAG.getMergeValues(Ops, SL);
}
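
// Each scalar load above is offset by i * (element store size) bytes, so
// splitting a v4i16 extload, for example, produces four i16 extloads at
// byte offsets 0, 2, 4 and 6, with the chains joined by a TokenFactor.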

SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemVT = Store->getMemoryVT();
  unsigned MemBits = MemVT.getSizeInBits();

  // Byte stores are really expensive, so if possible, try to pack a small
  // (32-bit or narrower) vector truncating store into a single i32 store.
  // XXX: We could also optimize other vector bitwidths.
  if (!MemVT.isVector() || MemBits > 32) {
    return SDValue();
  }

  SDLoc DL(Op);
  SDValue Value = Store->getValue();
  EVT VT = Value.getValueType();
  EVT ElemVT = VT.getVectorElementType();
  SDValue Ptr = Store->getBasePtr();
  EVT MemEltVT = MemVT.getVectorElementType();
  unsigned MemEltBits = MemEltVT.getSizeInBits();
  unsigned MemNumElements = MemVT.getVectorNumElements();
  unsigned PackedSize = MemVT.getStoreSizeInBits();
  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32);

  assert(Value.getValueType().getScalarSizeInBits() >= 32);

  SDValue PackedValue;
  for (unsigned i = 0; i < MemNumElements; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
                              DAG.getConstant(i, MVT::i32));
    Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
    Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg

    SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32);
    Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);

    if (i == 0) {
      PackedValue = Elt;
    } else {
      PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
    }
  }

  if (PackedSize < 32) {
    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
    return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
                             Store->getMemOperand()->getPointerInfo(),
                             PackedVT,
                             Store->isNonTemporal(), Store->isVolatile(),
                             Store->getAlignment());
  }

  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
                      Store->getMemOperand()->getPointerInfo(),
                      Store->isVolatile(), Store->isNonTemporal(),
                      Store->getAlignment());
}
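
// As a concrete example, a v4i8 truncating store packs each byte into one
// i32 via (Elt & 0xff) << (8 * i) and ORs the partial results together,
// turning four byte stores into a single dword store. A v2i8 store packs
// only 16 bits, so it takes the truncating-store path above instead.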

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
  EVT PtrVT = Store->getBasePtr().getValueType();
  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
  SDLoc SL(Op);

  SmallVector<SDValue, 8> Chains;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                              Store->getValue(), DAG.getConstant(i, MVT::i32));
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
                              Store->getBasePtr(),
                            DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
                                            PtrVT));
    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
                         MachinePointerInfo(Store->getMemOperand()->getValue()),
                         MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
                         Store->getAlignment()));
  }
  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
}

SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT VT = Op.getValueType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() &&
      VT.getSizeInBits() > 32) {
    // We can do the extload to 32-bits, and then need to separately extend to
    // 64-bits.

    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
                                       Load->getChain(),
                                       Load->getBasePtr(),
                                       MemVT,
                                       Load->getMemOperand());

    SDValue Ops[] = {
      DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
      ExtLoad32.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
    assert(VT == MVT::i1 && "Only i1 non-extloads expected");
    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, MVT::i8, MMO);

    SDValue Ops[] = {
      DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  // Lower loads of constant address space global variables.
  if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
      isa<GlobalVariable>(
          GetUnderlyingObject(Load->getMemOperand()->getValue()))) {

    SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
        DAG.getConstant(2, MVT::i32));
    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
                       Load->getChain(), Ptr,
                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
  }

  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
    return SDValue();

  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
                            DAG.getConstant(2, MVT::i32));
  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
                            Load->getChain(), Ptr,
                            DAG.getTargetConstant(0, MVT::i32),
                            Op.getOperand(2));
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                Load->getBasePtr(),
                                DAG.getConstant(0x3, MVT::i32));
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, MVT::i32));

  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);

  EVT MemEltVT = MemVT.getScalarType();
  if (ExtType == ISD::SEXTLOAD) {
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);

    SDValue Ops[] = {
      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
      Load->getChain()
    };

    return DAG.getMergeValues(Ops, DL);
  }

  SDValue Ops[] = {
    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
    Load->getChain()
  };

  return DAG.getMergeValues(Ops, DL);
}
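
// The private-address path above implements sub-dword extloads as a
// read-and-extract: load the containing 32-bit register (BasePtr >> 2),
// shift the addressed byte down by (BasePtr & 3) * 8 bits, then sign- or
// zero-extend in place depending on the extension type.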

SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Chain = Store->getChain();
  if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
      Store->getValue().getValueType().isVector()) {
    return SplitVectorStore(Op, DAG);
  }

  EVT MemVT = Store->getMemoryVT();
  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
      MemVT.bitsLT(MVT::i32)) {
    unsigned Mask = 0;
    if (Store->getMemoryVT() == MVT::i8) {
      Mask = 0xff;
    } else if (Store->getMemoryVT() == MVT::i16) {
      Mask = 0xffff;
    }
    SDValue BasePtr = Store->getBasePtr();
    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
                              DAG.getConstant(2, MVT::i32));
    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                              Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));

    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
                                  DAG.getConstant(0x3, MVT::i32));

    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                   DAG.getConstant(3, MVT::i32));

    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                    Store->getValue());

    SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);

    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                       MaskedValue, ShiftAmt);

    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                  DAG.getConstant(Mask, MVT::i32), ShiftAmt);
    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
                          DAG.getConstant(0xffffffff, MVT::i32));
    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);

    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                       Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
  }
  return SDValue();
}
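
// The sub-dword private store above is the store-side counterpart of the
// LowerLOAD path: it loads the containing dword, clears the destination
// byte or halfword with an inverted shifted mask, ORs in the shifted value,
// and writes the whole dword back with REGISTER_STORE.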

SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT OVT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT INTTY;
  MVT FLTTY;
  if (!OVT.isVector()) {
    INTTY = MVT::i32;
    FLTTY = MVT::f32;
  } else if (OVT.getVectorNumElements() == 2) {
    INTTY = MVT::v2i32;
    FLTTY = MVT::v2f32;
  } else if (OVT.getVectorNumElements() == 4) {
    INTTY = MVT::v4i32;
    FLTTY = MVT::v4f32;
  }
  unsigned bitsize = OVT.getScalarType().getSizeInBits();
  // char|short jq = ia ^ ib;
  SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);

  // jq = jq >> (bitsize - 2)
  jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));

  // jq = jq | 0x1
  jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));

  // jq = (int)jq
  jq = DAG.getSExtOrTrunc(jq, DL, INTTY);

  // int ia = (int)LHS;
  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);

  // int ib = (int)RHS;
  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);

  // float fq = native_divide(fa, fb);
  SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);

  // float fr = mad(fqneg, fb, fa);
  // These are floating-point operands, so the multiply must be FMUL.
  SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
      DAG.getNode(ISD::FMUL, DL, FLTTY, fqneg, fb), fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);

  // int cv = fr >= fb;
  // Both branches of the original if/else were identical, so a single
  // setcc suffices for the scalar and vector cases.
  SDValue cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
      DAG.getConstant(0, OVT));
  // dst = iq + jq;
  iq = DAG.getSExtOrTrunc(iq, DL, OVT);
  iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
  return iq;
}
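
// This works because an i8 or i16 dividend and divisor are exactly
// representable in f32, so the reciprocal-based quotient is off by at most
// one; jq holds sign(LHS ^ RHS) as a +/-1 correction that is applied
// exactly when |fr| >= |fb| indicates the truncated quotient fell short.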

SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT OVT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  // The LowerSDIV32 function generates code equivalent to the following IL:
1366  // mov r0, LHS
1367  // mov r1, RHS
1368  // ilt r10, r0, 0
1369  // ilt r11, r1, 0
1370  // iadd r0, r0, r10
1371  // iadd r1, r1, r11
1372  // ixor r0, r0, r10
1373  // ixor r1, r1, r11
1374  // udiv r0, r0, r1
1375  // ixor r10, r10, r11
1376  // iadd r0, r0, r10
1377  // ixor DST, r0, r10
1378
1379  // mov r0, LHS
1380  SDValue r0 = LHS;
1381
1382  // mov r1, RHS
1383  SDValue r1 = RHS;
1384
1385  // ilt r10, r0, 0
1386  SDValue r10 = DAG.getSelectCC(DL,
1387      r0, DAG.getConstant(0, OVT),
1388      DAG.getConstant(-1, OVT),
1389      DAG.getConstant(0, OVT),
1390      ISD::SETLT);
1391
1392  // ilt r11, r1, 0
1393  SDValue r11 = DAG.getSelectCC(DL,
1394      r1, DAG.getConstant(0, OVT),
1395      DAG.getConstant(-1, OVT),
1396      DAG.getConstant(0, OVT),
1397      ISD::SETLT);
1398
1399  // iadd r0, r0, r10
1400  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
1401
1402  // iadd r1, r1, r11
1403  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
1404
1405  // ixor r0, r0, r10
1406  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
1407
1408  // ixor r1, r1, r11
1409  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
1410
1411  // udiv r0, r0, r1
1412  r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
1413
1414  // ixor r10, r10, r11
1415  r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
1416
1417  // iadd r0, r0, r10
1418  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
1419
1420  // ixor DST, r0, r10
1421  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
1422  return DST;
1423}
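
// The same sequence as scalar C (an illustrative sketch, not the emitted
// code):
//
//   int sdiv32(int a, int b) {
//     int as = a >> 31, bs = b >> 31;    // -1 for negative inputs, else 0
//     unsigned ua = (a + as) ^ as;       // two's-complement |a|
//     unsigned ub = (b + bs) ^ bs;       // two's-complement |b|
//     int qs = as ^ bs;                  // sign of the quotient
//     return (int)((ua / ub + qs) ^ qs); // conditionally negate
//   }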
1424
1425SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
1426  return SDValue(Op.getNode(), 0);
1427}
1428
1429SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
1430  EVT OVT = Op.getValueType().getScalarType();
1431
1432  if (OVT == MVT::i64)
1433    return LowerSDIV64(Op, DAG);
1434
1435  if (OVT == MVT::i32)
1436    return LowerSDIV32(Op, DAG);
1437
1438  if (OVT == MVT::i16 || OVT == MVT::i8) {
1439    // FIXME: We should be checking for the masked bits. This isn't reached
1440    // because i8 and i16 are not legal types.
1441    return LowerSDIV24(Op, DAG);
1442  }
1443
1444  return SDValue(Op.getNode(), 0);
1445}
1446
1447SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
1448  SDLoc DL(Op);
1449  EVT OVT = Op.getValueType();
1450  SDValue LHS = Op.getOperand(0);
1451  SDValue RHS = Op.getOperand(1);
1452  // The LowerSREM32 function generates code equivalent to the following IL:
1453  // mov r0, LHS
1454  // mov r1, RHS
1455  // ilt r10, r0, 0
1456  // ilt r11, r1, 0
1457  // iadd r0, r0, r10
1458  // iadd r1, r1, r11
1459  // ixor r0, r0, r10
1460  // ixor r1, r1, r11
1461  // udiv r20, r0, r1
1462  // umul r20, r20, r1
1463  // sub r0, r0, r20
1464  // iadd r0, r0, r10
1465  // ixor DST, r0, r10
1466
1467  // mov r0, LHS
1468  SDValue r0 = LHS;
1469
1470  // mov r1, RHS
1471  SDValue r1 = RHS;
1472
1473  // ilt r10, r0, 0
1474  SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
1475
1476  // ilt r11, r1, 0
1477  SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
1478
1479  // iadd r0, r0, r10
1480  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
1481
1482  // iadd r1, r1, r11
1483  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
1484
1485  // ixor r0, r0, r10
1486  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
1487
1488  // ixor r1, r1, r11
1489  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
1490
1491  // udiv r20, r0, r1
1492  SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
1493
1494  // umul r20, r20, r1
1495  r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
1496
1497  // sub r0, r0, r20
1498  r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
1499
1500  // iadd r0, r0, r10
1501  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
1502
1503  // ixor DST, r0, r10
1504  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
1505  return DST;
1506}
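
// A worked example of the sequence above with LHS = -7, RHS = 3:
//   r10 = -1, r11 = 0
//   r0 = (-7 + -1) ^ -1 = 7, r1 = 3               (absolute values)
//   r20 = 7 / 3 = 2, then r20 = 2 * 3 = 6, r0 = 7 - 6 = 1
//   r0 = 1 + -1 = 0, DST = 0 ^ -1 = -1            (= -7 % 3 in C)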
1507
1508SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
1509  return SDValue(Op.getNode(), 0);
1510}
1511
1512SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
1513  EVT OVT = Op.getValueType();
1514
1515  if (OVT.getScalarType() == MVT::i64)
1516    return LowerSREM64(Op, DAG);
1517
1518  if (OVT.getScalarType() == MVT::i32)
1519    return LowerSREM32(Op, DAG);
1520
1521  return SDValue(Op.getNode(), 0);
1522}
1523
1524SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1525                                           SelectionDAG &DAG) const {
1526  SDLoc DL(Op);
1527  EVT VT = Op.getValueType();
1528
1529  SDValue Num = Op.getOperand(0);
1530  SDValue Den = Op.getOperand(1);
1531
1532  // RCP = URECIP(Den) = 2^32 / Den + e,
1533  // where e is the rounding error.
1534  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1535
1536  // RCP_LO = umulo(RCP, Den)
1537  SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
1538
1539  // RCP_HI = mulhu(RCP, Den)
1540  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1541
1542  // NEG_RCP_LO = -RCP_LO
1543  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
1544                                                     RCP_LO);
1545
1546  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1547  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
1548                                           NEG_RCP_LO, RCP_LO,
1549                                           ISD::SETEQ);
1550  // Calculate the rounding error from the URECIP instruction
1551  // E = mulhu(ABS_RCP_LO, RCP)
1552  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1553
1554  // RCP_A_E = RCP + E
1555  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1556
1557  // RCP_S_E = RCP - E
1558  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1559
1560  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
1561  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
1562                                     RCP_A_E, RCP_S_E,
1563                                     ISD::SETEQ);
1564  // Quotient = mulhu(Tmp0, Num)
1565  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1566
1567  // Num_S_Remainder = Quotient * Den
1568  SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
1569
1570  // Remainder = Num - Num_S_Remainder
1571  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1572
1573  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1574  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1575                                                 DAG.getConstant(-1, VT),
1576                                                 DAG.getConstant(0, VT),
1577                                                 ISD::SETUGE);
1578  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1579  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1580                                                  Num_S_Remainder,
1581                                                  DAG.getConstant(-1, VT),
1582                                                  DAG.getConstant(0, VT),
1583                                                  ISD::SETUGE);
1584  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1585  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1586                                               Remainder_GE_Zero);
1587
1588  // Calculate Division result:
1589
1590  // Quotient_A_One = Quotient + 1
1591  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1592                                                         DAG.getConstant(1, VT));
1593
1594  // Quotient_S_One = Quotient - 1
1595  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1596                                                         DAG.getConstant(1, VT));
1597
1598  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1599  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
1600                                     Quotient, Quotient_A_One, ISD::SETEQ);
1601
1602  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1603  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
1604                            Quotient_S_One, Div, ISD::SETEQ);
1605
1606  // Calculate Rem result:
1607
1608  // Remainder_S_Den = Remainder - Den
1609  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1610
1611  // Remainder_A_Den = Remainder + Den
1612  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1613
1614  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1615  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
1616                                    Remainder, Remainder_S_Den, ISD::SETEQ);
1617
1618  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1619  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
1620                            Remainder_A_Den, Rem, ISD::SETEQ);
1621  SDValue Ops[2] = {
1622    Div,
1623    Rem
1624  };
1625  return DAG.getMergeValues(Ops, DL);
1626}
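
// The tail of the sequence above is the usual +/-1 fix-up after an
// approximate reciprocal multiply. In scalar terms (an illustrative sketch;
// mulhu and tmp0 stand in for MULHU and Tmp0):
//
//   uint32_t q = mulhu(tmp0, n);              // first quotient guess
//   uint32_t r = n - q * d;                   // remainder for that guess
//   if (n < q * d)   { q -= 1; r += d; }      // guess was one too high
//   else if (r >= d) { q += 1; r -= d; }      // guess was one too low
//
// Remainder_GE_Zero and Remainder_GE_Den are the branchless forms of those
// two comparisons.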
1627
1628SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1629                                           SelectionDAG &DAG) const {
1630  SDLoc DL(Op);
1631  EVT VT = Op.getValueType();
1632
1633  SDValue Zero = DAG.getConstant(0, VT);
1634  SDValue NegOne = DAG.getConstant(-1, VT);
1635
1636  SDValue LHS = Op.getOperand(0);
1637  SDValue RHS = Op.getOperand(1);
1638
1639  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1640  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1641  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1642  SDValue RSign = LHSign; // Remainder sign is the same as LHS
1643
1644  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1645  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1646
1647  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1648  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1649
1650  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1651  SDValue Rem = Div.getValue(1);
1652
1653  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1654  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1655
1656  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
1657  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
1658
1659  SDValue Res[2] = {
1660    Div,
1661    Rem
1662  };
1663  return DAG.getMergeValues(Res, DL);
1664}
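
// This is the same two's-complement folding as LowerSDIV32/LowerSREM32, done
// once for both results. E.g. LHS = -7, RHS = 3 gives LHSign = -1,
// RHSign = 0, UDIVREM(7, 3) = (2, 1), and then Div = (2 ^ -1) - -1 = -2,
// Rem = (1 ^ -1) - -1 = -1, matching C's truncating division.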
1665
1666SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
1667  SDLoc SL(Op);
1668  SDValue Src = Op.getOperand(0);
1669
1670  // result = trunc(src)
1671  // if (src > 0.0 && src != result)
1672  //   result += 1.0
1673
1674  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1675
1676  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
1677  const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
1678
1679  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
1680
1681  SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
1682  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1683  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);
1684
1685  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
1686  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1687}
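
// A scalar model of the lowering above (a sketch; the ordered compares mean
// a NaN input simply flows through the FADD unchanged):
//
//   double ceil64(double x) {
//     double t = trunc(x);
//     return t + ((x > 0.0 && x != t) ? 1.0 : 0.0);
//   }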
1688
1689SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
1690  SDLoc SL(Op);
1691  SDValue Src = Op.getOperand(0);
1692
1693  assert(Op.getValueType() == MVT::f64);
1694
1695  const SDValue Zero = DAG.getConstant(0, MVT::i32);
1696  const SDValue One = DAG.getConstant(1, MVT::i32);
1697
1698  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1699
1700  // Extract the upper half, since this is where we will find the sign and
1701  // exponent.
1702  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
1703
1704  const unsigned FractBits = 52;
1705  const unsigned ExpBits = 11;
1706
1707  // Extract the exponent.
1708  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32,
1709                                Hi,
1710                                DAG.getConstant(FractBits - 32, MVT::i32),
1711                                DAG.getConstant(ExpBits, MVT::i32));
1712  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
1713                            DAG.getConstant(1023, MVT::i32));
1714
1715  // Extract the sign bit.
1716  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
1717  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
1718
1719  // Extend back to 64 bits.
1720  SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
1721                                  Zero, SignBit);
1722  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
1723
1724  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
1725  const SDValue FractMask
1726    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64);
1727
1728  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
1729  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
1730  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
1731
1732  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
1733
1734  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32);
1735
1736  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1737  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1738
1739  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
1740  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
1741
1742  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
1743}
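
// A bit-level sketch of the above, assuming IEEE-754 doubles and an
// illustrative bit_cast helper:
//
//   double trunc64(double x) {
//     uint64_t bits = bit_cast<uint64_t>(x);
//     int exp = (int)((bits >> 52) & 0x7ff) - 1023;  // unbiased exponent
//     if (exp < 0)  return bit_cast<double>(bits & (1ull << 63)); // +/-0.0
//     if (exp > 51) return x;                        // already integral
//     uint64_t frac = ((1ull << 52) - 1) >> exp;     // fractional-bit mask
//     return bit_cast<double>(bits & ~frac);
//   }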
1744
1745SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
1746  SDLoc SL(Op);
1747  SDValue Src = Op.getOperand(0);
1748
1749  assert(Op.getValueType() == MVT::f64);
1750
1751  APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
1752  SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64);
1753  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
1754
1755  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
1756  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
1757
1758  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
1759
1760  APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
1761  SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64);
1762
1763  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
1764  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
1765
1766  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
1767}
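
// This is the classic 2^52 trick: with round-to-nearest-even, adding and
// subtracting copysign(0x1.0p+52, x) rounds x at the integer position.
// Sketch:
//
//   double rint64(double x) {
//     if (fabs(x) > 0x1.fffffffffffffp+51)   // already integral
//       return x;
//     double c = copysign(0x1.0p+52, x);
//     return (x + c) - c;
//   }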
1768
1769SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
1770  // FNEARBYINT and FRINT are the same, except in their handling of FP
1771  // exceptions. Those aren't really meaningful for us, and OpenCL only has
1772  // rint, so just treat them as equivalent.
1773  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
1774}
1775
1776SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
1777  SDLoc SL(Op);
1778  SDValue Src = Op.getOperand(0);
1779
1780  // result = trunc(src);
1781  // if (src < 0.0 && src != result)
1782  //   result += -1.0
1783
1784  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1785
1786  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
1787  const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64);
1788
1789  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
1790
1791  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
1792  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1793  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1794
1795  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
1796  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1797}
1798
1799SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
1800                                               SelectionDAG &DAG) const {
1801  SDValue S0 = Op.getOperand(0);
1802  SDLoc DL(Op);
1803  if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
1804    return SDValue();
1805
1806  // f32 uint_to_fp i64
1807  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
1808                           DAG.getConstant(0, MVT::i32));
1809  SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
1810  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
1811                           DAG.getConstant(1, MVT::i32));
1812  SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
1813  FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
1814                        DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
1815  return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
1816}
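
// i.e. (float)v == (float)lo32(v) + (float)hi32(v) * 2^32. A scalar sketch:
//
//   float u64tof32(uint64_t v) {
//     return (float)(uint32_t)v + (float)(uint32_t)(v >> 32) * 4294967296.0f;
//   }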
1817
1818SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
1819                                                      unsigned BitsDiff,
1820                                                      SelectionDAG &DAG) const {
1821  MVT VT = Op.getSimpleValueType();
1822  SDLoc DL(Op);
1823  SDValue Shift = DAG.getConstant(BitsDiff, VT);
1824  // Shift left by 'Shift' bits.
1825  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
1826  // Arithmetic shift right by 'Shift' bits.
1827  return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
1828}
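
// e.g. sign-extending the low 8 bits of an i32 uses BitsDiff = 24: the
// familiar (x << 24) >> 24 idiom, with an arithmetic right shift.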
1829
1830SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
1831                                                     SelectionDAG &DAG) const {
1832  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
1833  MVT VT = Op.getSimpleValueType();
1834  MVT ScalarVT = VT.getScalarType();
1835
1836  if (!VT.isVector())
1837    return SDValue();
1838
1839  SDValue Src = Op.getOperand(0);
1840  SDLoc DL(Op);
1841
1842  // TODO: Don't scalarize on Evergreen?
1843  unsigned NElts = VT.getVectorNumElements();
1844  SmallVector<SDValue, 8> Args;
1845  DAG.ExtractVectorElements(Src, Args, 0, NElts);
1846
1847  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
1848  for (unsigned I = 0; I < NElts; ++I)
1849    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
1850
1851  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
1852}
1853
1854//===----------------------------------------------------------------------===//
1855// Custom DAG optimizations
1856//===----------------------------------------------------------------------===//
1857
1858static bool isU24(SDValue Op, SelectionDAG &DAG) {
1859  APInt KnownZero, KnownOne;
1860  EVT VT = Op.getValueType();
1861  DAG.computeKnownBits(Op, KnownZero, KnownOne);
1862
1863  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
1864}
1865
1866static bool isI24(SDValue Op, SelectionDAG &DAG) {
1867  EVT VT = Op.getValueType();
1868
1869  // In order for this to be a signed 24-bit value, bit 23 must be a sign
1870  // bit. Types narrower than 24 bits are treated as unsigned 24-bit
1871  // values.
1872  return VT.getSizeInBits() >= 24 &&
1873         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
1874}
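
// e.g. an i32 with 9 known sign bits has at most 32 - 9 = 23 significant
// bits, so it can feed a signed 24-bit multiply.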
1875
1876static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
1878  SelectionDAG &DAG = DCI.DAG;
1879  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
1880  EVT VT = Op.getValueType();
1881
1882  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
1883  APInt KnownZero, KnownOne;
1884  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
1885  if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
1886    DCI.CommitTargetLoweringOpt(TLO);
1887}
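
// For instance, if an operand of a 24-bit multiply is (and x, 0xffffff), only
// the low 24 bits are demanded, so SimplifyDemandedBits can drop the mask and
// use x directly.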
1888
1889template <typename IntTy>
1890static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
1891                               uint32_t Offset, uint32_t Width) {
1892  if (Width + Offset < 32) {
1893    IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width);
1894    return DAG.getConstant(Result, MVT::i32);
1895  }
1896
1897  return DAG.getConstant(Src0 >> Offset, MVT::i32);
1898}
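
// e.g. a signed extract with Width = 8, Offset = 4 from 0x12345678 computes
// (0x12345678 << 20) >> 24 = 0x67, the field in bits [11:4].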
1899
1900SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
1901                                                DAGCombinerInfo &DCI) const {
1902  EVT VT = N->getValueType(0);
1903
1904  if (VT.isVector() || VT.getSizeInBits() > 32)
1905    return SDValue();
1906
1907  SelectionDAG &DAG = DCI.DAG;
1908  SDLoc DL(N);
1909
1910  SDValue N0 = N->getOperand(0);
1911  SDValue N1 = N->getOperand(1);
1912  SDValue Mul;
1913
1914  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
1915    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
1916    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
1917    Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
1918  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
1919    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
1920    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
1921    Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
1922  } else {
1923    return SDValue();
1924  }
1925
1926  // We need to use sext even for MUL_U24, because MUL_U24 is used
1927  // for signed multiply of 8 and 16-bit types.
1928  return DAG.getSExtOrTrunc(Mul, DL, VT);
1929}
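
// e.g. a 32-bit multiply whose operands are proven to fit in 24 bits (such as
// zero-extended i16 values) becomes MUL_U24 and selects to the hardware's
// native 24-bit multiply.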
1930
1931SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
1932                                            DAGCombinerInfo &DCI) const {
1933  SelectionDAG &DAG = DCI.DAG;
1934  SDLoc DL(N);
1935
1936  switch (N->getOpcode()) {
1937  default:
1938    break;
1939  case ISD::MUL:
1940    return performMulCombine(N, DCI);
1941  case AMDGPUISD::MUL_I24:
1942  case AMDGPUISD::MUL_U24: {
1943    SDValue N0 = N->getOperand(0);
1944    SDValue N1 = N->getOperand(1);
1945    simplifyI24(N0, DCI);
1946    simplifyI24(N1, DCI);
1947    return SDValue();
1948  }
1949  case ISD::SELECT_CC:
1950    return CombineMinMax(N, DAG);
1951  case AMDGPUISD::BFE_I32:
1952  case AMDGPUISD::BFE_U32: {
1953    assert(!N->getValueType(0).isVector() &&
1954           "Vector handling of BFE not implemented");
1955    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
1956    if (!Width)
1957      break;
1958
1959    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
1960    if (WidthVal == 0)
1961      return DAG.getConstant(0, MVT::i32);
1962
1963    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
1964    if (!Offset)
1965      break;
1966
1967    SDValue BitsFrom = N->getOperand(0);
1968    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
1969
1970    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
1971
1972    if (OffsetVal == 0) {
1973      // This is already sign / zero extended, so try to fold away extra BFEs.
1974      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
1975
1976      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
1977      if (OpSignBits >= SignBits)
1978        return BitsFrom;
1979
1980      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
1981      if (Signed) {
1982        // This is a sign_extend_inreg. Replace it to take advantage of existing
1983        // DAG Combines. If not eliminated, we will match back to BFE during
1984        // selection.
1985
1986        // TODO: The sext_inreg of extended types is not combined here,
1987        // although we could handle them in a single BFE.
1988        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
1989                           DAG.getValueType(SmallVT));
1990      }
1991
1992      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
1993    }
1994
1995    if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
1996      if (Signed) {
1997        return constantFoldBFE<int32_t>(DAG,
1998                                        Val->getSExtValue(),
1999                                        OffsetVal,
2000                                        WidthVal);
2001      }
2002
2003      return constantFoldBFE<uint32_t>(DAG,
2004                                       Val->getZExtValue(),
2005                                       OffsetVal,
2006                                       WidthVal);
2007    }
2008
2009    if ((OffsetVal + WidthVal) >= 32) {
2010      SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
2011      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
2012                         BitsFrom, ShiftVal);
2013    }
2014
2015    APInt Demanded = APInt::getBitsSet(32,
2016                                       OffsetVal,
2017                                       OffsetVal + WidthVal);
2018
2019    APInt KnownZero, KnownOne;
2020    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
2021                                          !DCI.isBeforeLegalizeOps());
2022    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2023    if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
2024        TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) {
2025      DCI.CommitTargetLoweringOpt(TLO);
2026    }
2027
2028    break;
2029  }
2030  }
2031  return SDValue();
2032}
2033
2034//===----------------------------------------------------------------------===//
2035// Helper functions
2036//===----------------------------------------------------------------------===//
2037
2038void AMDGPUTargetLowering::getOriginalFunctionArgs(
2039                               SelectionDAG &DAG,
2040                               const Function *F,
2041                               const SmallVectorImpl<ISD::InputArg> &Ins,
2042                               SmallVectorImpl<ISD::InputArg> &OrigIns) const {
2044  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
2045    if (Ins[i].ArgVT == Ins[i].VT) {
2046      OrigIns.push_back(Ins[i]);
2047      continue;
2048    }
2049
2050    EVT VT;
2051    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
2052      // Vector has been split into scalars.
2053      VT = Ins[i].ArgVT.getVectorElementType();
2054    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
2055               Ins[i].ArgVT.getVectorElementType() !=
2056               Ins[i].VT.getVectorElementType()) {
2057      // Vector elements have been promoted
2058      VT = Ins[i].ArgVT;
2059    } else {
2060      // Vector has been split into smaller vectors.
2061      VT = Ins[i].VT;
2062    }
2063
2064    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
2065                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
2066    OrigIns.push_back(Arg);
2067  }
2068}
2069
2070bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
2071  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
2072    return CFP->isExactlyValue(1.0);
2073  }
2074  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
2075    return C->isAllOnesValue();
2076  }
2077  return false;
2078}
2079
2080bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
2081  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
2082    return CFP->getValueAPF().isZero();
2083  }
2084  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
2085    return C->isNullValue();
2086  }
2087  return false;
2088}
2089
2090SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
2091                                                  const TargetRegisterClass *RC,
2092                                                   unsigned Reg, EVT VT) const {
2093  MachineFunction &MF = DAG.getMachineFunction();
2094  MachineRegisterInfo &MRI = MF.getRegInfo();
2095  unsigned VirtualRegister;
2096  if (!MRI.isLiveIn(Reg)) {
2097    VirtualRegister = MRI.createVirtualRegister(RC);
2098    MRI.addLiveIn(Reg, VirtualRegister);
2099  } else {
2100    VirtualRegister = MRI.getLiveInVirtReg(Reg);
2101  }
2102  return DAG.getRegister(VirtualRegister, VT);
2103}
2104
2105#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
2106
2107const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
2108  switch (Opcode) {
2109  default: return nullptr;
2110  // AMDIL DAG nodes
2111  NODE_NAME_CASE(CALL)
2112  NODE_NAME_CASE(UMUL)
2113  NODE_NAME_CASE(RET_FLAG)
2114  NODE_NAME_CASE(BRANCH_COND)
2115
2116  // AMDGPU DAG nodes
2117  NODE_NAME_CASE(DWORDADDR)
2118  NODE_NAME_CASE(FRACT)
2119  NODE_NAME_CASE(CLAMP)
2120  NODE_NAME_CASE(FMAX)
2121  NODE_NAME_CASE(SMAX)
2122  NODE_NAME_CASE(UMAX)
2123  NODE_NAME_CASE(FMIN)
2124  NODE_NAME_CASE(SMIN)
2125  NODE_NAME_CASE(UMIN)
2126  NODE_NAME_CASE(URECIP)
2127  NODE_NAME_CASE(DIV_SCALE)
2128  NODE_NAME_CASE(DIV_FMAS)
2129  NODE_NAME_CASE(DIV_FIXUP)
2130  NODE_NAME_CASE(TRIG_PREOP)
2131  NODE_NAME_CASE(RCP)
2132  NODE_NAME_CASE(RSQ)
2133  NODE_NAME_CASE(RSQ_LEGACY)
2134  NODE_NAME_CASE(RSQ_CLAMPED)
2135  NODE_NAME_CASE(DOT4)
2136  NODE_NAME_CASE(BFE_U32)
2137  NODE_NAME_CASE(BFE_I32)
2138  NODE_NAME_CASE(BFI)
2139  NODE_NAME_CASE(BFM)
2140  NODE_NAME_CASE(BREV)
2141  NODE_NAME_CASE(MUL_U24)
2142  NODE_NAME_CASE(MUL_I24)
2143  NODE_NAME_CASE(MAD_U24)
2144  NODE_NAME_CASE(MAD_I24)
2145  NODE_NAME_CASE(EXPORT)
2146  NODE_NAME_CASE(CONST_ADDRESS)
2147  NODE_NAME_CASE(REGISTER_LOAD)
2148  NODE_NAME_CASE(REGISTER_STORE)
2149  NODE_NAME_CASE(LOAD_CONSTANT)
2150  NODE_NAME_CASE(LOAD_INPUT)
2151  NODE_NAME_CASE(SAMPLE)
2152  NODE_NAME_CASE(SAMPLEB)
2153  NODE_NAME_CASE(SAMPLED)
2154  NODE_NAME_CASE(SAMPLEL)
2155  NODE_NAME_CASE(CVT_F32_UBYTE0)
2156  NODE_NAME_CASE(CVT_F32_UBYTE1)
2157  NODE_NAME_CASE(CVT_F32_UBYTE2)
2158  NODE_NAME_CASE(CVT_F32_UBYTE3)
2159  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
2160  NODE_NAME_CASE(STORE_MSKOR)
2161  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
2162  }
2163}
2164
2165static void computeKnownBitsForMinMax(const SDValue Op0,
2166                                      const SDValue Op1,
2167                                      APInt &KnownZero,
2168                                      APInt &KnownOne,
2169                                      const SelectionDAG &DAG,
2170                                      unsigned Depth) {
2171  APInt Op0Zero, Op0One;
2172  APInt Op1Zero, Op1One;
2173  DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
2174  DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
2175
2176  KnownZero = Op0Zero & Op1Zero;
2177  KnownOne = Op0One & Op1One;
2178}
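
// A min/max always returns one of its operands, so any bit that is known and
// equal in both inputs is known in the result; e.g. if both inputs have their
// top 24 bits known zero, so does the min or max.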
2179
2180void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
2181  const SDValue Op,
2182  APInt &KnownZero,
2183  APInt &KnownOne,
2184  const SelectionDAG &DAG,
2185  unsigned Depth) const {
2187  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
2188
2189  APInt KnownZero2;
2190  APInt KnownOne2;
2191  unsigned Opc = Op.getOpcode();
2192
2193  switch (Opc) {
2194  default:
2195    break;
2196  case ISD::INTRINSIC_WO_CHAIN: {
2197    // FIXME: The intrinsic should just use the node.
2198    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
2199    case AMDGPUIntrinsic::AMDGPU_imax:
2200    case AMDGPUIntrinsic::AMDGPU_umax:
2201    case AMDGPUIntrinsic::AMDGPU_imin:
2202    case AMDGPUIntrinsic::AMDGPU_umin:
2203      computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
2204                                KnownZero, KnownOne, DAG, Depth);
2205      break;
2206    default:
2207      break;
2208    }
2209
2210    break;
2211  }
2212  case AMDGPUISD::SMAX:
2213  case AMDGPUISD::UMAX:
2214  case AMDGPUISD::SMIN:
2215  case AMDGPUISD::UMIN:
2216    computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
2217                              KnownZero, KnownOne, DAG, Depth);
2218    break;
2219
2220  case AMDGPUISD::BFE_I32:
2221  case AMDGPUISD::BFE_U32: {
2222    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
2223    if (!CWidth)
2224      return;
2225
2226    unsigned BitWidth = 32;
2227    uint32_t Width = CWidth->getZExtValue() & 0x1f;
2228    if (Width == 0) {
2229      KnownZero = APInt::getAllOnesValue(BitWidth);
2230      KnownOne = APInt::getNullValue(BitWidth);
2231      return;
2232    }
2233
2234    // FIXME: This could do a lot more. If offset is 0, should be the same as
2235    // sign_extend_inreg implementation, but that involves duplicating it.
2236    if (Opc == AMDGPUISD::BFE_I32)
2237      KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
2238    else
2239      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
2240
2241    break;
2242  }
2243  }
2244}
2245
2246unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
2247  SDValue Op,
2248  const SelectionDAG &DAG,
2249  unsigned Depth) const {
2250  switch (Op.getOpcode()) {
2251  case AMDGPUISD::BFE_I32: {
2252    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
2253    if (!Width)
2254      return 1;
2255
2256    unsigned SignBits = 32 - Width->getZExtValue() + 1;
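    // e.g. a constant width of 8 leaves bits [31:7] equal to bit 7 after the
    // sign extend, i.e. 32 - 8 + 1 = 25 sign bits.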
2257    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2258    if (!Offset || !Offset->isNullValue())
2259      return SignBits;
2260
2261    // TODO: Could probably figure something out with non-0 offsets.
2262    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
2263    return std::max(SignBits, Op0SignBits);
2264  }
2265
2266  case AMDGPUISD::BFE_U32: {
2267    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
2268    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
2269  }
2270
2271  default:
2272    return 1;
2273  }
2274}
2275