1//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for R600
12//
13//===----------------------------------------------------------------------===//
14
15#include "R600ISelLowering.h"
16#include "AMDGPUFrameLowering.h"
17#include "AMDGPUIntrinsicInfo.h"
18#include "AMDGPUSubtarget.h"
19#include "R600Defines.h"
20#include "R600InstrInfo.h"
21#include "R600MachineFunctionInfo.h"
22#include "llvm/Analysis/ValueTracking.h"
23#include "llvm/CodeGen/CallingConvLower.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/CodeGen/MachineInstrBuilder.h"
26#include "llvm/CodeGen/MachineRegisterInfo.h"
27#include "llvm/CodeGen/SelectionDAG.h"
28#include "llvm/IR/Argument.h"
29#include "llvm/IR/Function.h"
30
31using namespace llvm;
32
33R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
34                                       const R600Subtarget &STI)
35    : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
36  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
37  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
38  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
39  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
40  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
41  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
42
43  computeRegisterProperties(STI.getRegisterInfo());
44
45  // Legalize loads and stores to the private address space.
46  setOperationAction(ISD::LOAD, MVT::i32, Custom);
47  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
48  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
49
50  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
51  // spaces, so it is custom lowered to handle those where it isn't.
52  for (MVT VT : MVT::integer_valuetypes()) {
53    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
54    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
55    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
56
57    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
58    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
59    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
60
61    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
62    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
63    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
64  }
65
66  // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
67  setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
68  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
69  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
70
71  setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
72  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
73  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
74
75
76  setOperationAction(ISD::STORE, MVT::i8, Custom);
77  setOperationAction(ISD::STORE, MVT::i32, Custom);
78  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
79  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
80
81  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
82  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
83
84  // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
85  setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
86  setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
87
88  // Set condition code actions
89  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
90  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
91  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
92  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
93  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
94  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
95  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
96  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
97  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
98  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
99  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
100  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
101
102  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
103  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
104  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
105  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
106
107  setOperationAction(ISD::FCOS, MVT::f32, Custom);
108  setOperationAction(ISD::FSIN, MVT::f32, Custom);
109
110  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
111  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
112
113  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
114  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
115  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
116
117  setOperationAction(ISD::FSUB, MVT::f32, Expand);
118
119  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
120  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
121
122  setOperationAction(ISD::SETCC, MVT::i32, Expand);
123  setOperationAction(ISD::SETCC, MVT::f32, Expand);
124  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
125  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
126  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
127
128  setOperationAction(ISD::SELECT, MVT::i32, Expand);
129  setOperationAction(ISD::SELECT, MVT::f32, Expand);
130  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
131  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
132
133  // ADD, SUB overflow.
134  // TODO: turn these into Legal?
135  if (Subtarget->hasCARRY())
136    setOperationAction(ISD::UADDO, MVT::i32, Custom);
137
138  if (Subtarget->hasBORROW())
139    setOperationAction(ISD::USUBO, MVT::i32, Custom);
140
141  // Expand sign extension of vectors
142  if (!Subtarget->hasBFE())
143    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
144
145  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
146  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
147
148  if (!Subtarget->hasBFE())
149    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
150  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
151  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
152
153  if (!Subtarget->hasBFE())
154    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
155  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
156  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
157
158  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
159  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
160  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
161
162  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
163
164  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
165
166  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
167  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
168  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
169  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
170
171  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
172  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
173  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
174  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
175
176  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
177  //  to be Legal/Custom in order to avoid library calls.
178  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
179  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
180  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
181
182  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
183
184  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
185  for (MVT VT : ScalarIntVTs) {
186    setOperationAction(ISD::ADDC, VT, Expand);
187    setOperationAction(ISD::SUBC, VT, Expand);
188    setOperationAction(ISD::ADDE, VT, Expand);
189    setOperationAction(ISD::SUBE, VT, Expand);
190  }
191
192  setSchedulingPreference(Sched::Source);
193
194
195  setTargetDAGCombine(ISD::FP_ROUND);
196  setTargetDAGCombine(ISD::FP_TO_SINT);
197  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
198  setTargetDAGCombine(ISD::SELECT_CC);
199  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
200}
201
202const R600Subtarget *R600TargetLowering::getSubtarget() const {
203  return static_cast<const R600Subtarget *>(Subtarget);
204}
205
206static inline bool isEOP(MachineBasicBlock::iterator I) {
207  return std::next(I)->getOpcode() == AMDGPU::RETURN;
208}
209
210MachineBasicBlock *
211R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
212                                                MachineBasicBlock *BB) const {
213  MachineFunction * MF = BB->getParent();
214  MachineRegisterInfo &MRI = MF->getRegInfo();
215  MachineBasicBlock::iterator I = MI;
216  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
217
218  switch (MI.getOpcode()) {
219  default:
220    // Replace LDS_*_RET instruction that don't have any uses with the
221    // equivalent LDS_*_NORET instruction.
222    if (TII->isLDSRetInstr(MI.getOpcode())) {
223      int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
224      assert(DstIdx != -1);
225      MachineInstrBuilder NewMI;
226      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
227      //        LDS_1A2D support and remove this special case.
228      if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
229          MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
230        return BB;
231
232      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
233                      TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
234      for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
235        NewMI.addOperand(MI.getOperand(i));
236      }
237    } else {
238      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
239    }
240    break;
241  case AMDGPU::CLAMP_R600: {
242    MachineInstr *NewMI = TII->buildDefaultInstruction(
243        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
244        MI.getOperand(1).getReg());
245    TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
246    break;
247  }
248
249  case AMDGPU::FABS_R600: {
250    MachineInstr *NewMI = TII->buildDefaultInstruction(
251        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
252        MI.getOperand(1).getReg());
253    TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
254    break;
255  }
256
257  case AMDGPU::FNEG_R600: {
258    MachineInstr *NewMI = TII->buildDefaultInstruction(
259        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
260        MI.getOperand(1).getReg());
261    TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
262    break;
263  }
264
265  case AMDGPU::MASK_WRITE: {
266    unsigned maskedRegister = MI.getOperand(0).getReg();
267    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
268    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
269    TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
270    break;
271  }
272
273  case AMDGPU::MOV_IMM_F32:
274    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
275                                                            .getFPImm()
276                                                            ->getValueAPF()
277                                                            .bitcastToAPInt()
278                                                            .getZExtValue());
279    break;
280  case AMDGPU::MOV_IMM_I32:
281    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
282                     MI.getOperand(1).getImm());
283    break;
284  case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
285    //TODO: Perhaps combine this instruction with the next if possible
286    auto MIB = TII->buildDefaultInstruction(
287        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
288    int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
289    //TODO: Ugh this is rather ugly
290    MIB->getOperand(Idx) = MI.getOperand(1);
291    break;
292  }
293  case AMDGPU::CONST_COPY: {
294    MachineInstr *NewMI = TII->buildDefaultInstruction(
295        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
296    TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
297                       MI.getOperand(1).getImm());
298    break;
299  }
300
301  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
302  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
303  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
304    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
305        .addOperand(MI.getOperand(0))
306        .addOperand(MI.getOperand(1))
307        .addImm(isEOP(I)); // Set End of program bit
308    break;
309  }
310  case AMDGPU::RAT_STORE_TYPED_eg: {
311    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
312        .addOperand(MI.getOperand(0))
313        .addOperand(MI.getOperand(1))
314        .addOperand(MI.getOperand(2))
315        .addImm(isEOP(I)); // Set End of program bit
316    break;
317  }
318
319  case AMDGPU::TXD: {
320    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
321    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
322    MachineOperand &RID = MI.getOperand(4);
323    MachineOperand &SID = MI.getOperand(5);
324    unsigned TextureId = MI.getOperand(6).getImm();
325    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
326    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
327
328    switch (TextureId) {
329    case 5: // Rect
330      CTX = CTY = 0;
331      break;
332    case 6: // Shadow1D
333      SrcW = SrcZ;
334      break;
335    case 7: // Shadow2D
336      SrcW = SrcZ;
337      break;
338    case 8: // ShadowRect
339      CTX = CTY = 0;
340      SrcW = SrcZ;
341      break;
342    case 9: // 1DArray
343      SrcZ = SrcY;
344      CTZ = 0;
345      break;
346    case 10: // 2DArray
347      CTZ = 0;
348      break;
349    case 11: // Shadow1DArray
350      SrcZ = SrcY;
351      CTZ = 0;
352      break;
353    case 12: // Shadow2DArray
354      CTZ = 0;
355      break;
356    }
357    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
358            T0)
359        .addOperand(MI.getOperand(3))
360        .addImm(SrcX)
361        .addImm(SrcY)
362        .addImm(SrcZ)
363        .addImm(SrcW)
364        .addImm(0)
365        .addImm(0)
366        .addImm(0)
367        .addImm(0)
368        .addImm(1)
369        .addImm(2)
370        .addImm(3)
371        .addOperand(RID)
372        .addOperand(SID)
373        .addImm(CTX)
374        .addImm(CTY)
375        .addImm(CTZ)
376        .addImm(CTW);
377    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
378            T1)
379        .addOperand(MI.getOperand(2))
380        .addImm(SrcX)
381        .addImm(SrcY)
382        .addImm(SrcZ)
383        .addImm(SrcW)
384        .addImm(0)
385        .addImm(0)
386        .addImm(0)
387        .addImm(0)
388        .addImm(1)
389        .addImm(2)
390        .addImm(3)
391        .addOperand(RID)
392        .addOperand(SID)
393        .addImm(CTX)
394        .addImm(CTY)
395        .addImm(CTZ)
396        .addImm(CTW);
397    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
398        .addOperand(MI.getOperand(0))
399        .addOperand(MI.getOperand(1))
400        .addImm(SrcX)
401        .addImm(SrcY)
402        .addImm(SrcZ)
403        .addImm(SrcW)
404        .addImm(0)
405        .addImm(0)
406        .addImm(0)
407        .addImm(0)
408        .addImm(1)
409        .addImm(2)
410        .addImm(3)
411        .addOperand(RID)
412        .addOperand(SID)
413        .addImm(CTX)
414        .addImm(CTY)
415        .addImm(CTZ)
416        .addImm(CTW)
417        .addReg(T0, RegState::Implicit)
418        .addReg(T1, RegState::Implicit);
419    break;
420  }
421
422  case AMDGPU::TXD_SHADOW: {
423    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
424    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
425    MachineOperand &RID = MI.getOperand(4);
426    MachineOperand &SID = MI.getOperand(5);
427    unsigned TextureId = MI.getOperand(6).getImm();
428    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
429    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
430
431    switch (TextureId) {
432    case 5: // Rect
433      CTX = CTY = 0;
434      break;
435    case 6: // Shadow1D
436      SrcW = SrcZ;
437      break;
438    case 7: // Shadow2D
439      SrcW = SrcZ;
440      break;
441    case 8: // ShadowRect
442      CTX = CTY = 0;
443      SrcW = SrcZ;
444      break;
445    case 9: // 1DArray
446      SrcZ = SrcY;
447      CTZ = 0;
448      break;
449    case 10: // 2DArray
450      CTZ = 0;
451      break;
452    case 11: // Shadow1DArray
453      SrcZ = SrcY;
454      CTZ = 0;
455      break;
456    case 12: // Shadow2DArray
457      CTZ = 0;
458      break;
459    }
460
461    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
462            T0)
463        .addOperand(MI.getOperand(3))
464        .addImm(SrcX)
465        .addImm(SrcY)
466        .addImm(SrcZ)
467        .addImm(SrcW)
468        .addImm(0)
469        .addImm(0)
470        .addImm(0)
471        .addImm(0)
472        .addImm(1)
473        .addImm(2)
474        .addImm(3)
475        .addOperand(RID)
476        .addOperand(SID)
477        .addImm(CTX)
478        .addImm(CTY)
479        .addImm(CTZ)
480        .addImm(CTW);
481    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
482            T1)
483        .addOperand(MI.getOperand(2))
484        .addImm(SrcX)
485        .addImm(SrcY)
486        .addImm(SrcZ)
487        .addImm(SrcW)
488        .addImm(0)
489        .addImm(0)
490        .addImm(0)
491        .addImm(0)
492        .addImm(1)
493        .addImm(2)
494        .addImm(3)
495        .addOperand(RID)
496        .addOperand(SID)
497        .addImm(CTX)
498        .addImm(CTY)
499        .addImm(CTZ)
500        .addImm(CTW);
501    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
502        .addOperand(MI.getOperand(0))
503        .addOperand(MI.getOperand(1))
504        .addImm(SrcX)
505        .addImm(SrcY)
506        .addImm(SrcZ)
507        .addImm(SrcW)
508        .addImm(0)
509        .addImm(0)
510        .addImm(0)
511        .addImm(0)
512        .addImm(1)
513        .addImm(2)
514        .addImm(3)
515        .addOperand(RID)
516        .addOperand(SID)
517        .addImm(CTX)
518        .addImm(CTY)
519        .addImm(CTZ)
520        .addImm(CTW)
521        .addReg(T0, RegState::Implicit)
522        .addReg(T1, RegState::Implicit);
523    break;
524  }
525
526  case AMDGPU::BRANCH:
527    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
528        .addOperand(MI.getOperand(0));
529    break;
530
531  case AMDGPU::BRANCH_COND_f32: {
532    MachineInstr *NewMI =
533        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
534                AMDGPU::PREDICATE_BIT)
535            .addOperand(MI.getOperand(1))
536            .addImm(OPCODE_IS_NOT_ZERO)
537            .addImm(0); // Flags
538    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
539    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
540        .addOperand(MI.getOperand(0))
541        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
542    break;
543  }
544
545  case AMDGPU::BRANCH_COND_i32: {
546    MachineInstr *NewMI =
547        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
548                AMDGPU::PREDICATE_BIT)
549            .addOperand(MI.getOperand(1))
550            .addImm(OPCODE_IS_NOT_ZERO_INT)
551            .addImm(0); // Flags
552    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
553    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
554        .addOperand(MI.getOperand(0))
555        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
556    break;
557  }
558
559  case AMDGPU::EG_ExportSwz:
560  case AMDGPU::R600_ExportSwz: {
561    // Instruction is left unmodified if its not the last one of its type
562    bool isLastInstructionOfItsType = true;
563    unsigned InstExportType = MI.getOperand(1).getImm();
564    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
565         EndBlock = BB->end(); NextExportInst != EndBlock;
566         NextExportInst = std::next(NextExportInst)) {
567      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
568          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
569        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
570            .getImm();
571        if (CurrentInstExportType == InstExportType) {
572          isLastInstructionOfItsType = false;
573          break;
574        }
575      }
576    }
577    bool EOP = isEOP(I);
578    if (!EOP && !isLastInstructionOfItsType)
579      return BB;
580    unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
581    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
582        .addOperand(MI.getOperand(0))
583        .addOperand(MI.getOperand(1))
584        .addOperand(MI.getOperand(2))
585        .addOperand(MI.getOperand(3))
586        .addOperand(MI.getOperand(4))
587        .addOperand(MI.getOperand(5))
588        .addOperand(MI.getOperand(6))
589        .addImm(CfInst)
590        .addImm(EOP);
591    break;
592  }
593  case AMDGPU::RETURN: {
594    // RETURN instructions must have the live-out registers as implicit uses,
595    // otherwise they appear dead.
596    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
597    MachineInstrBuilder MIB(*MF, MI);
598    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
599      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
600    return BB;
601  }
602  }
603
604  MI.eraseFromParent();
605  return BB;
606}
607
608//===----------------------------------------------------------------------===//
609// Custom DAG Lowering Operations
610//===----------------------------------------------------------------------===//
611
612SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
613  MachineFunction &MF = DAG.getMachineFunction();
614  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
615  switch (Op.getOpcode()) {
616  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
617  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
618  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
619  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
620  case ISD::SRA_PARTS:
621  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
622  case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
623  case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
624  case ISD::FCOS:
625  case ISD::FSIN: return LowerTrig(Op, DAG);
626  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
627  case ISD::STORE: return LowerSTORE(Op, DAG);
628  case ISD::LOAD: {
629    SDValue Result = LowerLOAD(Op, DAG);
630    assert((!Result.getNode() ||
631            Result.getNode()->getNumValues() == 2) &&
632           "Load should return a value and a chain");
633    return Result;
634  }
635
636  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
637  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
638  case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
639  case ISD::INTRINSIC_VOID: {
640    SDValue Chain = Op.getOperand(0);
641    unsigned IntrinsicID =
642                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
643    switch (IntrinsicID) {
644    case AMDGPUIntrinsic::R600_store_swizzle: {
645      SDLoc DL(Op);
646      const SDValue Args[8] = {
647        Chain,
648        Op.getOperand(2), // Export Value
649        Op.getOperand(3), // ArrayBase
650        Op.getOperand(4), // Type
651        DAG.getConstant(0, DL, MVT::i32), // SWZ_X
652        DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
653        DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
654        DAG.getConstant(3, DL, MVT::i32) // SWZ_W
655      };
656      return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
657    }
658
659    // default for switch(IntrinsicID)
660    default: break;
661    }
662    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
663    break;
664  }
665  case ISD::INTRINSIC_WO_CHAIN: {
666    unsigned IntrinsicID =
667                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
668    EVT VT = Op.getValueType();
669    SDLoc DL(Op);
670    switch(IntrinsicID) {
671    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
672    case AMDGPUIntrinsic::r600_tex:
673    case AMDGPUIntrinsic::r600_texc:
674    case AMDGPUIntrinsic::r600_txl:
675    case AMDGPUIntrinsic::r600_txlc:
676    case AMDGPUIntrinsic::r600_txb:
677    case AMDGPUIntrinsic::r600_txbc:
678    case AMDGPUIntrinsic::r600_txf:
679    case AMDGPUIntrinsic::r600_txq:
680    case AMDGPUIntrinsic::r600_ddx:
681    case AMDGPUIntrinsic::r600_ddy: {
682      unsigned TextureOp;
683      switch (IntrinsicID) {
684      case AMDGPUIntrinsic::r600_tex:
685        TextureOp = 0;
686        break;
687      case AMDGPUIntrinsic::r600_texc:
688        TextureOp = 1;
689        break;
690      case AMDGPUIntrinsic::r600_txl:
691        TextureOp = 2;
692        break;
693      case AMDGPUIntrinsic::r600_txlc:
694        TextureOp = 3;
695        break;
696      case AMDGPUIntrinsic::r600_txb:
697        TextureOp = 4;
698        break;
699      case AMDGPUIntrinsic::r600_txbc:
700        TextureOp = 5;
701        break;
702      case AMDGPUIntrinsic::r600_txf:
703        TextureOp = 6;
704        break;
705      case AMDGPUIntrinsic::r600_txq:
706        TextureOp = 7;
707        break;
708      case AMDGPUIntrinsic::r600_ddx:
709        TextureOp = 8;
710        break;
711      case AMDGPUIntrinsic::r600_ddy:
712        TextureOp = 9;
713        break;
714      default:
715        llvm_unreachable("Unknow Texture Operation");
716      }
717
718      SDValue TexArgs[19] = {
719        DAG.getConstant(TextureOp, DL, MVT::i32),
720        Op.getOperand(1),
721        DAG.getConstant(0, DL, MVT::i32),
722        DAG.getConstant(1, DL, MVT::i32),
723        DAG.getConstant(2, DL, MVT::i32),
724        DAG.getConstant(3, DL, MVT::i32),
725        Op.getOperand(2),
726        Op.getOperand(3),
727        Op.getOperand(4),
728        DAG.getConstant(0, DL, MVT::i32),
729        DAG.getConstant(1, DL, MVT::i32),
730        DAG.getConstant(2, DL, MVT::i32),
731        DAG.getConstant(3, DL, MVT::i32),
732        Op.getOperand(5),
733        Op.getOperand(6),
734        Op.getOperand(7),
735        Op.getOperand(8),
736        Op.getOperand(9),
737        Op.getOperand(10)
738      };
739      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
740    }
741    case AMDGPUIntrinsic::r600_dot4: {
742      SDValue Args[8] = {
743      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
744          DAG.getConstant(0, DL, MVT::i32)),
745      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
746          DAG.getConstant(0, DL, MVT::i32)),
747      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
748          DAG.getConstant(1, DL, MVT::i32)),
749      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
750          DAG.getConstant(1, DL, MVT::i32)),
751      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
752          DAG.getConstant(2, DL, MVT::i32)),
753      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
754          DAG.getConstant(2, DL, MVT::i32)),
755      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
756          DAG.getConstant(3, DL, MVT::i32)),
757      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
758          DAG.getConstant(3, DL, MVT::i32))
759      };
760      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
761    }
762
763    case Intrinsic::r600_implicitarg_ptr: {
764      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
765      uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
766      return DAG.getConstant(ByteOffset, DL, PtrVT);
767    }
768    case Intrinsic::r600_read_ngroups_x:
769      return LowerImplicitParameter(DAG, VT, DL, 0);
770    case Intrinsic::r600_read_ngroups_y:
771      return LowerImplicitParameter(DAG, VT, DL, 1);
772    case Intrinsic::r600_read_ngroups_z:
773      return LowerImplicitParameter(DAG, VT, DL, 2);
774    case Intrinsic::r600_read_global_size_x:
775      return LowerImplicitParameter(DAG, VT, DL, 3);
776    case Intrinsic::r600_read_global_size_y:
777      return LowerImplicitParameter(DAG, VT, DL, 4);
778    case Intrinsic::r600_read_global_size_z:
779      return LowerImplicitParameter(DAG, VT, DL, 5);
780    case Intrinsic::r600_read_local_size_x:
781      return LowerImplicitParameter(DAG, VT, DL, 6);
782    case Intrinsic::r600_read_local_size_y:
783      return LowerImplicitParameter(DAG, VT, DL, 7);
784    case Intrinsic::r600_read_local_size_z:
785      return LowerImplicitParameter(DAG, VT, DL, 8);
786
787    case Intrinsic::r600_read_workdim:
788    case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
789      uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
790      return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
791    }
792
793    case Intrinsic::r600_read_tgid_x:
794      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
795                                  AMDGPU::T1_X, VT);
796    case Intrinsic::r600_read_tgid_y:
797      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
798                                  AMDGPU::T1_Y, VT);
799    case Intrinsic::r600_read_tgid_z:
800      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
801                                  AMDGPU::T1_Z, VT);
802    case Intrinsic::r600_read_tidig_x:
803      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
804                                  AMDGPU::T0_X, VT);
805    case Intrinsic::r600_read_tidig_y:
806      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
807                                  AMDGPU::T0_Y, VT);
808    case Intrinsic::r600_read_tidig_z:
809      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
810                                  AMDGPU::T0_Z, VT);
811
812    // FIXME: Should be renamed to r600 prefix
813    case AMDGPUIntrinsic::AMDGPU_rsq_clamped:
814      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
815
816    case Intrinsic::r600_rsq:
817    case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
818      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
819      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
820    }
821    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
822    break;
823  }
824  } // end switch(Op.getOpcode())
825  return SDValue();
826}
827
828void R600TargetLowering::ReplaceNodeResults(SDNode *N,
829                                            SmallVectorImpl<SDValue> &Results,
830                                            SelectionDAG &DAG) const {
831  switch (N->getOpcode()) {
832  default:
833    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
834    return;
835  case ISD::FP_TO_UINT:
836    if (N->getValueType(0) == MVT::i1) {
837      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
838      return;
839    }
840    // Fall-through. Since we don't care about out of bounds values
841    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
842    // considers some extra cases which are not necessary here.
843  case ISD::FP_TO_SINT: {
844    SDValue Result;
845    if (expandFP_TO_SINT(N, Result, DAG))
846      Results.push_back(Result);
847    return;
848  }
849  case ISD::SDIVREM: {
850    SDValue Op = SDValue(N, 1);
851    SDValue RES = LowerSDIVREM(Op, DAG);
852    Results.push_back(RES);
853    Results.push_back(RES.getValue(1));
854    break;
855  }
856  case ISD::UDIVREM: {
857    SDValue Op = SDValue(N, 0);
858    LowerUDIVREM64(Op, DAG, Results);
859    break;
860  }
861  }
862}
863
864SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
865                                                   SDValue Vector) const {
866
867  SDLoc DL(Vector);
868  EVT VecVT = Vector.getValueType();
869  EVT EltVT = VecVT.getVectorElementType();
870  SmallVector<SDValue, 8> Args;
871
872  for (unsigned i = 0, e = VecVT.getVectorNumElements();
873                                                           i != e; ++i) {
874    Args.push_back(DAG.getNode(
875        ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
876        DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
877  }
878
879  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
880}
881
882SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
883                                                    SelectionDAG &DAG) const {
884
885  SDLoc DL(Op);
886  SDValue Vector = Op.getOperand(0);
887  SDValue Index = Op.getOperand(1);
888
889  if (isa<ConstantSDNode>(Index) ||
890      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
891    return Op;
892
893  Vector = vectorToVerticalVector(DAG, Vector);
894  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
895                     Vector, Index);
896}
897
898SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
899                                                   SelectionDAG &DAG) const {
900  SDLoc DL(Op);
901  SDValue Vector = Op.getOperand(0);
902  SDValue Value = Op.getOperand(1);
903  SDValue Index = Op.getOperand(2);
904
905  if (isa<ConstantSDNode>(Index) ||
906      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
907    return Op;
908
909  Vector = vectorToVerticalVector(DAG, Vector);
910  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
911                               Vector, Value, Index);
912  return vectorToVerticalVector(DAG, Insert);
913}
914
915SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
916                                               SDValue Op,
917                                               SelectionDAG &DAG) const {
918
919  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
920  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
921    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
922
923  const DataLayout &DL = DAG.getDataLayout();
924  const GlobalValue *GV = GSD->getGlobal();
925  MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
926
927  SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
928  return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
929}
930
931SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
932  // On hw >= R700, COS/SIN input must be between -1. and 1.
933  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
934  EVT VT = Op.getValueType();
935  SDValue Arg = Op.getOperand(0);
936  SDLoc DL(Op);
937
938  // TODO: Should this propagate fast-math-flags?
939  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
940      DAG.getNode(ISD::FADD, DL, VT,
941        DAG.getNode(ISD::FMUL, DL, VT, Arg,
942          DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
943        DAG.getConstantFP(0.5, DL, MVT::f32)));
944  unsigned TrigNode;
945  switch (Op.getOpcode()) {
946  case ISD::FCOS:
947    TrigNode = AMDGPUISD::COS_HW;
948    break;
949  case ISD::FSIN:
950    TrigNode = AMDGPUISD::SIN_HW;
951    break;
952  default:
953    llvm_unreachable("Wrong trig opcode");
954  }
955  SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
956      DAG.getNode(ISD::FADD, DL, VT, FractPart,
957        DAG.getConstantFP(-0.5, DL, MVT::f32)));
958  if (Gen >= R600Subtarget::R700)
959    return TrigVal;
960  // On R600 hw, COS/SIN input must be between -Pi and Pi.
961  return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
962      DAG.getConstantFP(3.14159265359, DL, MVT::f32));
963}
964
965SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
966  SDLoc DL(Op);
967  EVT VT = Op.getValueType();
968
969  SDValue Lo = Op.getOperand(0);
970  SDValue Hi = Op.getOperand(1);
971  SDValue Shift = Op.getOperand(2);
972  SDValue Zero = DAG.getConstant(0, DL, VT);
973  SDValue One  = DAG.getConstant(1, DL, VT);
974
975  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
976  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
977  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
978  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
979
980  // The dance around Width1 is necessary for 0 special case.
981  // Without it the CompShift might be 32, producing incorrect results in
982  // Overflow. So we do the shift in two steps, the alternative is to
983  // add a conditional to filter the special case.
984
985  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
986  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
987
988  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
989  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
990  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
991
992  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
993  SDValue LoBig = Zero;
994
995  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
996  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
997
998  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
999}
1000
1001SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1002  SDLoc DL(Op);
1003  EVT VT = Op.getValueType();
1004
1005  SDValue Lo = Op.getOperand(0);
1006  SDValue Hi = Op.getOperand(1);
1007  SDValue Shift = Op.getOperand(2);
1008  SDValue Zero = DAG.getConstant(0, DL, VT);
1009  SDValue One  = DAG.getConstant(1, DL, VT);
1010
1011  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1012
1013  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1014  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1015  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1016  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1017
1018  // The dance around Width1 is necessary for 0 special case.
1019  // Without it the CompShift might be 32, producing incorrect results in
1020  // Overflow. So we do the shift in two steps, the alternative is to
1021  // add a conditional to filter the special case.
1022
1023  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1024  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1025
1026  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1027  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1028  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1029
1030  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1031  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1032
1033  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1034  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1035
1036  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1037}
1038
1039SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1040                                          unsigned mainop, unsigned ovf) const {
1041  SDLoc DL(Op);
1042  EVT VT = Op.getValueType();
1043
1044  SDValue Lo = Op.getOperand(0);
1045  SDValue Hi = Op.getOperand(1);
1046
1047  SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1048  // Extend sign.
1049  OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1050                    DAG.getValueType(MVT::i1));
1051
1052  SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1053
1054  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1055}
1056
1057SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1058  SDLoc DL(Op);
1059  return DAG.getNode(
1060      ISD::SETCC,
1061      DL,
1062      MVT::i1,
1063      Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
1064      DAG.getCondCode(ISD::SETNE)
1065      );
1066}
1067
1068SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1069                                                   const SDLoc &DL,
1070                                                   unsigned DwordOffset) const {
1071  unsigned ByteOffset = DwordOffset * 4;
1072  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1073                                      AMDGPUAS::CONSTANT_BUFFER_0);
1074
1075  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1076  assert(isInt<16>(ByteOffset));
1077
1078  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1079                     DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1080                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1081                     false, false, false, 0);
1082}
1083
1084bool R600TargetLowering::isZero(SDValue Op) const {
1085  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1086    return Cst->isNullValue();
1087  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1088    return CstFP->isZero();
1089  } else {
1090    return false;
1091  }
1092}
1093
1094bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
1095  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1096    return CFP->isExactlyValue(1.0);
1097  }
1098  return isAllOnesConstant(Op);
1099}
1100
1101bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
1102  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1103    return CFP->getValueAPF().isZero();
1104  }
1105  return isNullConstant(Op);
1106}
1107
1108SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1109  SDLoc DL(Op);
1110  EVT VT = Op.getValueType();
1111
1112  SDValue LHS = Op.getOperand(0);
1113  SDValue RHS = Op.getOperand(1);
1114  SDValue True = Op.getOperand(2);
1115  SDValue False = Op.getOperand(3);
1116  SDValue CC = Op.getOperand(4);
1117  SDValue Temp;
1118
1119  if (VT == MVT::f32) {
1120    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1121    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1122    if (MinMax)
1123      return MinMax;
1124  }
1125
1126  // LHS and RHS are guaranteed to be the same value type
1127  EVT CompareVT = LHS.getValueType();
1128
1129  // Check if we can lower this to a native operation.
1130
1131  // Try to lower to a SET* instruction:
1132  //
1133  // SET* can match the following patterns:
1134  //
1135  // select_cc f32, f32, -1,  0, cc_supported
1136  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1137  // select_cc i32, i32, -1,  0, cc_supported
1138  //
1139
1140  // Move hardware True/False values to the correct operand.
1141  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1142  ISD::CondCode InverseCC =
1143     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1144  if (isHWTrueValue(False) && isHWFalseValue(True)) {
1145    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1146      std::swap(False, True);
1147      CC = DAG.getCondCode(InverseCC);
1148    } else {
1149      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1150      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1151        std::swap(False, True);
1152        std::swap(LHS, RHS);
1153        CC = DAG.getCondCode(SwapInvCC);
1154      }
1155    }
1156  }
1157
1158  if (isHWTrueValue(True) && isHWFalseValue(False) &&
1159      (CompareVT == VT || VT == MVT::i32)) {
1160    // This can be matched by a SET* instruction.
1161    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1162  }
1163
1164  // Try to lower to a CND* instruction:
1165  //
1166  // CND* can match the following patterns:
1167  //
1168  // select_cc f32, 0.0, f32, f32, cc_supported
1169  // select_cc f32, 0.0, i32, i32, cc_supported
1170  // select_cc i32, 0,   f32, f32, cc_supported
1171  // select_cc i32, 0,   i32, i32, cc_supported
1172  //
1173
1174  // Try to move the zero value to the RHS
1175  if (isZero(LHS)) {
1176    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1177    // Try swapping the operands
1178    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1179    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1180      std::swap(LHS, RHS);
1181      CC = DAG.getCondCode(CCSwapped);
1182    } else {
1183      // Try inverting the conditon and then swapping the operands
1184      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1185      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1186      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1187        std::swap(True, False);
1188        std::swap(LHS, RHS);
1189        CC = DAG.getCondCode(CCSwapped);
1190      }
1191    }
1192  }
1193  if (isZero(RHS)) {
1194    SDValue Cond = LHS;
1195    SDValue Zero = RHS;
1196    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1197    if (CompareVT != VT) {
1198      // Bitcast True / False to the correct types.  This will end up being
1199      // a nop, but it allows us to define only a single pattern in the
1200      // .TD files for each CND* instruction rather than having to have
1201      // one pattern for integer True/False and one for fp True/False
1202      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1203      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1204    }
1205
1206    switch (CCOpcode) {
1207    case ISD::SETONE:
1208    case ISD::SETUNE:
1209    case ISD::SETNE:
1210      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1211      Temp = True;
1212      True = False;
1213      False = Temp;
1214      break;
1215    default:
1216      break;
1217    }
1218    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1219        Cond, Zero,
1220        True, False,
1221        DAG.getCondCode(CCOpcode));
1222    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1223  }
1224
1225  // If we make it this for it means we have no native instructions to handle
1226  // this SELECT_CC, so we must lower it.
1227  SDValue HWTrue, HWFalse;
1228
1229  if (CompareVT == MVT::f32) {
1230    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
1231    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
1232  } else if (CompareVT == MVT::i32) {
1233    HWTrue = DAG.getConstant(-1, DL, CompareVT);
1234    HWFalse = DAG.getConstant(0, DL, CompareVT);
1235  }
1236  else {
1237    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1238  }
1239
1240  // Lower this unsupported SELECT_CC into a combination of two supported
1241  // SELECT_CC operations.
1242  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1243
1244  return DAG.getNode(ISD::SELECT_CC, DL, VT,
1245      Cond, HWFalse,
1246      True, False,
1247      DAG.getCondCode(ISD::SETNE));
1248}
1249
1250/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1251/// convert these pointers to a register index.  Each register holds
1252/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1253/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1254/// for indirect addressing.
1255SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1256                                               unsigned StackWidth,
1257                                               SelectionDAG &DAG) const {
1258  unsigned SRLPad;
1259  switch(StackWidth) {
1260  case 1:
1261    SRLPad = 2;
1262    break;
1263  case 2:
1264    SRLPad = 3;
1265    break;
1266  case 4:
1267    SRLPad = 4;
1268    break;
1269  default: llvm_unreachable("Invalid stack width");
1270  }
1271
1272  SDLoc DL(Ptr);
1273  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1274                     DAG.getConstant(SRLPad, DL, MVT::i32));
1275}
1276
1277void R600TargetLowering::getStackAddress(unsigned StackWidth,
1278                                         unsigned ElemIdx,
1279                                         unsigned &Channel,
1280                                         unsigned &PtrIncr) const {
1281  switch (StackWidth) {
1282  default:
1283  case 1:
1284    Channel = 0;
1285    if (ElemIdx > 0) {
1286      PtrIncr = 1;
1287    } else {
1288      PtrIncr = 0;
1289    }
1290    break;
1291  case 2:
1292    Channel = ElemIdx % 2;
1293    if (ElemIdx == 2) {
1294      PtrIncr = 1;
1295    } else {
1296      PtrIncr = 0;
1297    }
1298    break;
1299  case 4:
1300    Channel = ElemIdx;
1301    PtrIncr = 0;
1302    break;
1303  }
1304}
1305
1306SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
1307                                                   SelectionDAG &DAG) const {
1308  SDLoc DL(Store);
1309
1310  unsigned Mask = 0;
1311  if (Store->getMemoryVT() == MVT::i8) {
1312    Mask = 0xff;
1313  } else if (Store->getMemoryVT() == MVT::i16) {
1314    Mask = 0xffff;
1315  }
1316
1317  SDValue Chain = Store->getChain();
1318  SDValue BasePtr = Store->getBasePtr();
1319  EVT MemVT = Store->getMemoryVT();
1320
1321  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
1322                            DAG.getConstant(2, DL, MVT::i32));
1323  SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
1324                            Chain, Ptr,
1325                            DAG.getTargetConstant(0, DL, MVT::i32));
1326
1327  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
1328                                DAG.getConstant(0x3, DL, MVT::i32));
1329
1330  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1331                                 DAG.getConstant(3, DL, MVT::i32));
1332
1333  SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
1334                                  Store->getValue());
1335
1336  SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
1337
1338  SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
1339                                     MaskedValue, ShiftAmt);
1340
1341  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
1342                                DAG.getConstant(Mask, DL, MVT::i32),
1343                                ShiftAmt);
1344  DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
1345                        DAG.getConstant(0xffffffff, DL, MVT::i32));
1346  Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
1347
1348  SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
1349  return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1350                     Chain, Value, Ptr,
1351                     DAG.getTargetConstant(0, DL, MVT::i32));
1352}
1353
1354SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1355  if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
1356    return Result;
1357
1358  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1359  unsigned AS = StoreNode->getAddressSpace();
1360  SDValue Value = StoreNode->getValue();
1361  EVT ValueVT = Value.getValueType();
1362
1363  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
1364      ValueVT.isVector()) {
1365    return SplitVectorStore(Op, DAG);
1366  }
1367
1368  SDLoc DL(Op);
1369  SDValue Chain = StoreNode->getChain();
1370  SDValue Ptr = StoreNode->getBasePtr();
1371
1372  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
1373    if (StoreNode->isTruncatingStore()) {
1374      EVT VT = Value.getValueType();
1375      assert(VT.bitsLE(MVT::i32));
1376      EVT MemVT = StoreNode->getMemoryVT();
1377      SDValue MaskConstant;
1378      if (MemVT == MVT::i8) {
1379        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1380      } else {
1381        assert(MemVT == MVT::i16);
1382        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1383      }
1384      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1385                                      DAG.getConstant(2, DL, MVT::i32));
1386      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1387                                      DAG.getConstant(0x00000003, DL, VT));
1388      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1389      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1390                                   DAG.getConstant(3, DL, VT));
1391      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1392      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1393      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1394      // vector instead.
1395      SDValue Src[4] = {
1396        ShiftedValue,
1397        DAG.getConstant(0, DL, MVT::i32),
1398        DAG.getConstant(0, DL, MVT::i32),
1399        Mask
1400      };
1401      SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
1402      SDValue Args[3] = { Chain, Input, DWordAddr };
1403      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1404                                     Op->getVTList(), Args, MemVT,
1405                                     StoreNode->getMemOperand());
1406    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1407               ValueVT.bitsGE(MVT::i32)) {
1408      // Convert pointer from byte address to dword address.
1409      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1410                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1411                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));
1412
1413      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1414        llvm_unreachable("Truncated and indexed stores not supported yet");
1415      } else {
1416        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1417      }
1418      return Chain;
1419    }
1420  }
1421
1422  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
1423    return SDValue();
1424
1425  EVT MemVT = StoreNode->getMemoryVT();
1426  if (MemVT.bitsLT(MVT::i32))
1427    return lowerPrivateTruncStore(StoreNode, DAG);
1428
1429  // Lowering for indirect addressing
1430  const MachineFunction &MF = DAG.getMachineFunction();
1431  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1432  unsigned StackWidth = TFL->getStackWidth(MF);
1433
1434  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1435
1436  if (ValueVT.isVector()) {
1437    unsigned NumElemVT = ValueVT.getVectorNumElements();
1438    EVT ElemVT = ValueVT.getVectorElementType();
1439    SmallVector<SDValue, 4> Stores(NumElemVT);
1440
1441    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1442                                      "vector width in load");
1443
1444    for (unsigned i = 0; i < NumElemVT; ++i) {
1445      unsigned Channel, PtrIncr;
1446      getStackAddress(StackWidth, i, Channel, PtrIncr);
1447      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1448                        DAG.getConstant(PtrIncr, DL, MVT::i32));
1449      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1450                                 Value, DAG.getConstant(i, DL, MVT::i32));
1451
1452      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1453                              Chain, Elem, Ptr,
1454                              DAG.getTargetConstant(Channel, DL, MVT::i32));
1455    }
1456     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1457   } else {
1458    if (ValueVT == MVT::i8) {
1459      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1460    }
1461    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1462    DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
1463  }
1464
1465  return Chain;
1466}
1467
1468// return (512 + (kc_bank << 12)
1469static int
1470ConstantAddressBlock(unsigned AddressSpace) {
1471  switch (AddressSpace) {
1472  case AMDGPUAS::CONSTANT_BUFFER_0:
1473    return 512;
1474  case AMDGPUAS::CONSTANT_BUFFER_1:
1475    return 512 + 4096;
1476  case AMDGPUAS::CONSTANT_BUFFER_2:
1477    return 512 + 4096 * 2;
1478  case AMDGPUAS::CONSTANT_BUFFER_3:
1479    return 512 + 4096 * 3;
1480  case AMDGPUAS::CONSTANT_BUFFER_4:
1481    return 512 + 4096 * 4;
1482  case AMDGPUAS::CONSTANT_BUFFER_5:
1483    return 512 + 4096 * 5;
1484  case AMDGPUAS::CONSTANT_BUFFER_6:
1485    return 512 + 4096 * 6;
1486  case AMDGPUAS::CONSTANT_BUFFER_7:
1487    return 512 + 4096 * 7;
1488  case AMDGPUAS::CONSTANT_BUFFER_8:
1489    return 512 + 4096 * 8;
1490  case AMDGPUAS::CONSTANT_BUFFER_9:
1491    return 512 + 4096 * 9;
1492  case AMDGPUAS::CONSTANT_BUFFER_10:
1493    return 512 + 4096 * 10;
1494  case AMDGPUAS::CONSTANT_BUFFER_11:
1495    return 512 + 4096 * 11;
1496  case AMDGPUAS::CONSTANT_BUFFER_12:
1497    return 512 + 4096 * 12;
1498  case AMDGPUAS::CONSTANT_BUFFER_13:
1499    return 512 + 4096 * 13;
1500  case AMDGPUAS::CONSTANT_BUFFER_14:
1501    return 512 + 4096 * 14;
1502  case AMDGPUAS::CONSTANT_BUFFER_15:
1503    return 512 + 4096 * 15;
1504  default:
1505    return -1;
1506  }
1507}
1508
1509SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
1510                                                SelectionDAG &DAG) const {
1511  SDLoc DL(Op);
1512  LoadSDNode *Load = cast<LoadSDNode>(Op);
1513  ISD::LoadExtType ExtType = Load->getExtensionType();
1514  EVT MemVT = Load->getMemoryVT();
1515
1516  // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
1517  // register (2-)byte extract.
1518
1519  // Get Register holding the target.
1520  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
1521                            DAG.getConstant(2, DL, MVT::i32));
1522  // Load the Register.
1523  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
1524                            Load->getChain(),
1525                            Ptr,
1526                            DAG.getTargetConstant(0, DL, MVT::i32),
1527                            Op.getOperand(2));
1528
1529  // Get offset within the register.
1530  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
1531                                Load->getBasePtr(),
1532                                DAG.getConstant(0x3, DL, MVT::i32));
1533
1534  // Bit offset of target byte (byteIdx * 8).
1535  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1536                                 DAG.getConstant(3, DL, MVT::i32));
1537
1538  // Shift to the right.
1539  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
1540
1541  // Eliminate the upper bits by setting them to ...
1542  EVT MemEltVT = MemVT.getScalarType();
1543
1544  // ... ones.
1545  if (ExtType == ISD::SEXTLOAD) {
1546    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
1547
1548    SDValue Ops[] = {
1549      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
1550      Load->getChain()
1551    };
1552
1553    return DAG.getMergeValues(Ops, DL);
1554  }
1555
1556  // ... or zeros.
1557  SDValue Ops[] = {
1558    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
1559    Load->getChain()
1560  };
1561
1562  return DAG.getMergeValues(Ops, DL);
1563}
1564
1565SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1566  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1567  unsigned AS = LoadNode->getAddressSpace();
1568  EVT MemVT = LoadNode->getMemoryVT();
1569  ISD::LoadExtType ExtType = LoadNode->getExtensionType();
1570
1571  if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
1572      ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
1573    return lowerPrivateExtLoad(Op, DAG);
1574  }
1575
1576  SDLoc DL(Op);
1577  EVT VT = Op.getValueType();
1578  SDValue Chain = LoadNode->getChain();
1579  SDValue Ptr = LoadNode->getBasePtr();
1580
1581  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1582    SDValue MergedValues[2] = {
1583      scalarizeVectorLoad(LoadNode, DAG),
1584      Chain
1585    };
1586    return DAG.getMergeValues(MergedValues, DL);
1587  }
1588
1589  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1590  if (ConstantBlock > -1 &&
1591      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1592       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1593    SDValue Result;
1594    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1595        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1596        isa<ConstantSDNode>(Ptr)) {
1597      SDValue Slots[4];
1598      for (unsigned i = 0; i < 4; i++) {
1599        // We want Const position encoded with the following formula :
1600        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1601        // const_index is Ptr computed by llvm using an alignment of 16.
1602        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1603        // then div by 4 at the ISel step
1604        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1605            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1606        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1607      }
1608      EVT NewVT = MVT::v4i32;
1609      unsigned NumElements = 4;
1610      if (VT.isVector()) {
1611        NewVT = VT;
1612        NumElements = VT.getVectorNumElements();
1613      }
1614      Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
1615    } else {
1616      // non-constant ptr can't be folded, keeps it as a v4f32 load
1617      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1618          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1619                      DAG.getConstant(4, DL, MVT::i32)),
1620                      DAG.getConstant(LoadNode->getAddressSpace() -
1621                                      AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1622          );
1623    }
1624
1625    if (!VT.isVector()) {
1626      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1627                           DAG.getConstant(0, DL, MVT::i32));
1628    }
1629
1630    SDValue MergedValues[2] = {
1631      Result,
1632      Chain
1633    };
1634    return DAG.getMergeValues(MergedValues, DL);
1635  }
1636
1637  SDValue LoweredLoad;
1638
1639  // For most operations returning SDValue() will result in the node being
1640  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1641  // need to manually expand loads that may be legal in some address spaces and
1642  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1643  // compute shaders, since the data is sign extended when it is uploaded to the
1644  // buffer. However SEXT loads from other address spaces are not supported, so
1645  // we need to expand them here.
1646  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1647    EVT MemVT = LoadNode->getMemoryVT();
1648    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1649    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1650                                  LoadNode->getPointerInfo(), MemVT,
1651                                  LoadNode->isVolatile(),
1652                                  LoadNode->isNonTemporal(),
1653                                  LoadNode->isInvariant(),
1654                                  LoadNode->getAlignment());
1655    SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1656                              DAG.getValueType(MemVT));
1657
1658    SDValue MergedValues[2] = { Res, Chain };
1659    return DAG.getMergeValues(MergedValues, DL);
1660  }
1661
1662  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1663    return SDValue();
1664  }
1665
1666  // Lowering for indirect addressing
1667  const MachineFunction &MF = DAG.getMachineFunction();
1668  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1669  unsigned StackWidth = TFL->getStackWidth(MF);
1670
1671  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1672
1673  if (VT.isVector()) {
1674    unsigned NumElemVT = VT.getVectorNumElements();
1675    EVT ElemVT = VT.getVectorElementType();
1676    SDValue Loads[4];
1677
1678    assert(NumElemVT <= 4);
1679    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1680                                      "vector width in load");
1681
1682    for (unsigned i = 0; i < NumElemVT; ++i) {
1683      unsigned Channel, PtrIncr;
1684      getStackAddress(StackWidth, i, Channel, PtrIncr);
1685      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1686                        DAG.getConstant(PtrIncr, DL, MVT::i32));
1687      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1688                             Chain, Ptr,
1689                             DAG.getTargetConstant(Channel, DL, MVT::i32),
1690                             Op.getOperand(2));
1691    }
1692    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
1693    LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
1694  } else {
1695    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1696                              Chain, Ptr,
1697                              DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1698                              Op.getOperand(2));
1699  }
1700
1701  SDValue Ops[2] = {
1702    LoweredLoad,
1703    Chain
1704  };
1705
1706  return DAG.getMergeValues(Ops, DL);
1707}
1708
1709SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1710  SDValue Chain = Op.getOperand(0);
1711  SDValue Cond  = Op.getOperand(1);
1712  SDValue Jump  = Op.getOperand(2);
1713
1714  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1715                     Chain, Jump, Cond);
1716}
1717
1718SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
1719                                            SelectionDAG &DAG) const {
1720  MachineFunction &MF = DAG.getMachineFunction();
1721  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1722
1723  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
1724
1725  unsigned FrameIndex = FIN->getIndex();
1726  unsigned IgnoredFrameReg;
1727  unsigned Offset =
1728    TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
1729  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
1730                         Op.getValueType());
1731}
1732
1733/// XXX Only kernel functions are supported, so we can assume for now that
1734/// every function is a kernel function, but in the future we should use
1735/// separate calling conventions for kernel and non-kernel functions.
1736SDValue R600TargetLowering::LowerFormalArguments(
1737    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1738    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1739    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1740  SmallVector<CCValAssign, 16> ArgLocs;
1741  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1742                 *DAG.getContext());
1743  MachineFunction &MF = DAG.getMachineFunction();
1744  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1745
1746  SmallVector<ISD::InputArg, 8> LocalIns;
1747
1748  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1749
1750  AnalyzeFormalArguments(CCInfo, LocalIns);
1751
1752  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1753    CCValAssign &VA = ArgLocs[i];
1754    const ISD::InputArg &In = Ins[i];
1755    EVT VT = In.VT;
1756    EVT MemVT = VA.getLocVT();
1757    if (!VT.isVector() && MemVT.isVector()) {
1758      // Get load source type if scalarized.
1759      MemVT = MemVT.getVectorElementType();
1760    }
1761
1762    if (AMDGPU::isShader(CallConv)) {
1763      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1764      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1765      InVals.push_back(Register);
1766      continue;
1767    }
1768
1769    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1770                                          AMDGPUAS::CONSTANT_BUFFER_0);
1771
1772    // i64 isn't a legal type, so the register type used ends up as i32, which
1773    // isn't expected here. It attempts to create this sextload, but it ends up
1774    // being invalid. Somehow this seems to work with i64 arguments, but breaks
1775    // for <1 x i64>.
1776
1777    // The first 36 bytes of the input buffer contains information about
1778    // thread group and global sizes.
1779    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1780    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1781      // FIXME: This should really check the extload type, but the handling of
1782      // extload vector parameters seems to be broken.
1783
1784      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1785      Ext = ISD::SEXTLOAD;
1786    }
1787
1788    // Compute the offset from the value.
1789    // XXX - I think PartOffset should give you this, but it seems to give the
1790    // size of the register which isn't useful.
1791
1792    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1793    unsigned PartOffset = VA.getLocMemOffset();
1794    unsigned Offset = 36 + VA.getLocMemOffset();
1795
1796    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1797    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1798                              DAG.getConstant(Offset, DL, MVT::i32),
1799                              DAG.getUNDEF(MVT::i32),
1800                              PtrInfo,
1801                              MemVT, false, true, true, 4);
1802
1803    // 4 is the preferred alignment for the CONSTANT memory space.
1804    InVals.push_back(Arg);
1805    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1806  }
1807  return Chain;
1808}
1809
1810EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1811                                           EVT VT) const {
1812   if (!VT.isVector())
1813     return MVT::i32;
1814   return VT.changeVectorElementTypeToInteger();
1815}
1816
1817bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1818                                                        unsigned AddrSpace,
1819                                                        unsigned Align,
1820                                                        bool *IsFast) const {
1821  if (IsFast)
1822    *IsFast = false;
1823
1824  if (!VT.isSimple() || VT == MVT::Other)
1825    return false;
1826
1827  if (VT.bitsLT(MVT::i32))
1828    return false;
1829
1830  // TODO: This is a rough estimate.
1831  if (IsFast)
1832    *IsFast = true;
1833
1834  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1835}
1836
1837static SDValue CompactSwizzlableVector(
1838  SelectionDAG &DAG, SDValue VectorEntry,
1839  DenseMap<unsigned, unsigned> &RemapSwizzle) {
1840  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1841  assert(RemapSwizzle.empty());
1842  SDValue NewBldVec[4] = {
1843    VectorEntry.getOperand(0),
1844    VectorEntry.getOperand(1),
1845    VectorEntry.getOperand(2),
1846    VectorEntry.getOperand(3)
1847  };
1848
1849  for (unsigned i = 0; i < 4; i++) {
1850    if (NewBldVec[i].isUndef())
1851      // We mask write here to teach later passes that the ith element of this
1852      // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1853      // break false dependencies and additionnaly make assembly easier to read.
1854      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1855    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1856      if (C->isZero()) {
1857        RemapSwizzle[i] = 4; // SEL_0
1858        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1859      } else if (C->isExactlyValue(1.0)) {
1860        RemapSwizzle[i] = 5; // SEL_1
1861        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1862      }
1863    }
1864
1865    if (NewBldVec[i].isUndef())
1866      continue;
1867    for (unsigned j = 0; j < i; j++) {
1868      if (NewBldVec[i] == NewBldVec[j]) {
1869        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1870        RemapSwizzle[i] = j;
1871        break;
1872      }
1873    }
1874  }
1875
1876  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1877                            NewBldVec);
1878}
1879
1880static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1881                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
1882  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1883  assert(RemapSwizzle.empty());
1884  SDValue NewBldVec[4] = {
1885      VectorEntry.getOperand(0),
1886      VectorEntry.getOperand(1),
1887      VectorEntry.getOperand(2),
1888      VectorEntry.getOperand(3)
1889  };
1890  bool isUnmovable[4] = { false, false, false, false };
1891  for (unsigned i = 0; i < 4; i++) {
1892    RemapSwizzle[i] = i;
1893    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1894      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1895          ->getZExtValue();
1896      if (i == Idx)
1897        isUnmovable[Idx] = true;
1898    }
1899  }
1900
1901  for (unsigned i = 0; i < 4; i++) {
1902    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1903      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1904          ->getZExtValue();
1905      if (isUnmovable[Idx])
1906        continue;
1907      // Swap i and Idx
1908      std::swap(NewBldVec[Idx], NewBldVec[i]);
1909      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1910      break;
1911    }
1912  }
1913
1914  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1915                            NewBldVec);
1916}
1917
1918SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
1919                                            SelectionDAG &DAG,
1920                                            const SDLoc &DL) const {
1921  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1922  // Old -> New swizzle values
1923  DenseMap<unsigned, unsigned> SwizzleRemap;
1924
1925  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1926  for (unsigned i = 0; i < 4; i++) {
1927    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1928    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1929      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1930  }
1931
1932  SwizzleRemap.clear();
1933  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1934  for (unsigned i = 0; i < 4; i++) {
1935    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1936    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1937      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1938  }
1939
1940  return BuildVector;
1941}
1942
1943
1944//===----------------------------------------------------------------------===//
1945// Custom DAG Optimizations
1946//===----------------------------------------------------------------------===//
1947
1948SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1949                                              DAGCombinerInfo &DCI) const {
1950  SelectionDAG &DAG = DCI.DAG;
1951
1952  switch (N->getOpcode()) {
1953  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1954  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1955  case ISD::FP_ROUND: {
1956      SDValue Arg = N->getOperand(0);
1957      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1958        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1959                           Arg.getOperand(0));
1960      }
1961      break;
1962    }
1963
1964  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1965  // (i32 select_cc f32, f32, -1, 0 cc)
1966  //
1967  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1968  // this to one of the SET*_DX10 instructions.
1969  case ISD::FP_TO_SINT: {
1970    SDValue FNeg = N->getOperand(0);
1971    if (FNeg.getOpcode() != ISD::FNEG) {
1972      return SDValue();
1973    }
1974    SDValue SelectCC = FNeg.getOperand(0);
1975    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1976        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1977        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1978        !isHWTrueValue(SelectCC.getOperand(2)) ||
1979        !isHWFalseValue(SelectCC.getOperand(3))) {
1980      return SDValue();
1981    }
1982
1983    SDLoc dl(N);
1984    return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1985                           SelectCC.getOperand(0), // LHS
1986                           SelectCC.getOperand(1), // RHS
1987                           DAG.getConstant(-1, dl, MVT::i32), // True
1988                           DAG.getConstant(0, dl, MVT::i32),  // False
1989                           SelectCC.getOperand(4)); // CC
1990
1991    break;
1992  }
1993
1994  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1995  // => build_vector elt0, ... , NewEltIdx, ... , eltN
1996  case ISD::INSERT_VECTOR_ELT: {
1997    SDValue InVec = N->getOperand(0);
1998    SDValue InVal = N->getOperand(1);
1999    SDValue EltNo = N->getOperand(2);
2000    SDLoc dl(N);
2001
2002    // If the inserted element is an UNDEF, just use the input vector.
2003    if (InVal.isUndef())
2004      return InVec;
2005
2006    EVT VT = InVec.getValueType();
2007
2008    // If we can't generate a legal BUILD_VECTOR, exit
2009    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
2010      return SDValue();
2011
2012    // Check that we know which element is being inserted
2013    if (!isa<ConstantSDNode>(EltNo))
2014      return SDValue();
2015    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
2016
2017    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
2018    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
2019    // vector elements.
2020    SmallVector<SDValue, 8> Ops;
2021    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
2022      Ops.append(InVec.getNode()->op_begin(),
2023                 InVec.getNode()->op_end());
2024    } else if (InVec.isUndef()) {
2025      unsigned NElts = VT.getVectorNumElements();
2026      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
2027    } else {
2028      return SDValue();
2029    }
2030
2031    // Insert the element
2032    if (Elt < Ops.size()) {
2033      // All the operands of BUILD_VECTOR must have the same type;
2034      // we enforce that here.
2035      EVT OpVT = Ops[0].getValueType();
2036      if (InVal.getValueType() != OpVT)
2037        InVal = OpVT.bitsGT(InVal.getValueType()) ?
2038          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
2039          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
2040      Ops[Elt] = InVal;
2041    }
2042
2043    // Return the new vector
2044    return DAG.getBuildVector(VT, dl, Ops);
2045  }
2046
2047  // Extract_vec (Build_vector) generated by custom lowering
2048  // also needs to be customly combined
2049  case ISD::EXTRACT_VECTOR_ELT: {
2050    SDValue Arg = N->getOperand(0);
2051    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
2052      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2053        unsigned Element = Const->getZExtValue();
2054        return Arg->getOperand(Element);
2055      }
2056    }
2057    if (Arg.getOpcode() == ISD::BITCAST &&
2058        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
2059      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2060        unsigned Element = Const->getZExtValue();
2061        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
2062            Arg->getOperand(0).getOperand(Element));
2063      }
2064    }
2065    break;
2066  }
2067
2068  case ISD::SELECT_CC: {
2069    // Try common optimizations
2070    if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
2071      return Ret;
2072
2073    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
2074    //      selectcc x, y, a, b, inv(cc)
2075    //
2076    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
2077    //      selectcc x, y, a, b, cc
2078    SDValue LHS = N->getOperand(0);
2079    if (LHS.getOpcode() != ISD::SELECT_CC) {
2080      return SDValue();
2081    }
2082
2083    SDValue RHS = N->getOperand(1);
2084    SDValue True = N->getOperand(2);
2085    SDValue False = N->getOperand(3);
2086    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2087
2088    if (LHS.getOperand(2).getNode() != True.getNode() ||
2089        LHS.getOperand(3).getNode() != False.getNode() ||
2090        RHS.getNode() != False.getNode()) {
2091      return SDValue();
2092    }
2093
2094    switch (NCC) {
2095    default: return SDValue();
2096    case ISD::SETNE: return LHS;
2097    case ISD::SETEQ: {
2098      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2099      LHSCC = ISD::getSetCCInverse(LHSCC,
2100                                  LHS.getOperand(0).getValueType().isInteger());
2101      if (DCI.isBeforeLegalizeOps() ||
2102          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2103        return DAG.getSelectCC(SDLoc(N),
2104                               LHS.getOperand(0),
2105                               LHS.getOperand(1),
2106                               LHS.getOperand(2),
2107                               LHS.getOperand(3),
2108                               LHSCC);
2109      break;
2110    }
2111    }
2112    return SDValue();
2113  }
2114
2115  case AMDGPUISD::EXPORT: {
2116    SDValue Arg = N->getOperand(1);
2117    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2118      break;
2119
2120    SDValue NewArgs[8] = {
2121      N->getOperand(0), // Chain
2122      SDValue(),
2123      N->getOperand(2), // ArrayBase
2124      N->getOperand(3), // Type
2125      N->getOperand(4), // SWZ_X
2126      N->getOperand(5), // SWZ_Y
2127      N->getOperand(6), // SWZ_Z
2128      N->getOperand(7) // SWZ_W
2129    };
2130    SDLoc DL(N);
2131    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2132    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2133  }
2134  case AMDGPUISD::TEXTURE_FETCH: {
2135    SDValue Arg = N->getOperand(1);
2136    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2137      break;
2138
2139    SDValue NewArgs[19] = {
2140      N->getOperand(0),
2141      N->getOperand(1),
2142      N->getOperand(2),
2143      N->getOperand(3),
2144      N->getOperand(4),
2145      N->getOperand(5),
2146      N->getOperand(6),
2147      N->getOperand(7),
2148      N->getOperand(8),
2149      N->getOperand(9),
2150      N->getOperand(10),
2151      N->getOperand(11),
2152      N->getOperand(12),
2153      N->getOperand(13),
2154      N->getOperand(14),
2155      N->getOperand(15),
2156      N->getOperand(16),
2157      N->getOperand(17),
2158      N->getOperand(18),
2159    };
2160    SDLoc DL(N);
2161    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2162    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2163  }
2164  }
2165
2166  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2167}
2168
2169bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
2170                                     SDValue &Src, SDValue &Neg, SDValue &Abs,
2171                                     SDValue &Sel, SDValue &Imm,
2172                                     SelectionDAG &DAG) const {
2173  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2174  if (!Src.isMachineOpcode())
2175    return false;
2176
2177  switch (Src.getMachineOpcode()) {
2178  case AMDGPU::FNEG_R600:
2179    if (!Neg.getNode())
2180      return false;
2181    Src = Src.getOperand(0);
2182    Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2183    return true;
2184  case AMDGPU::FABS_R600:
2185    if (!Abs.getNode())
2186      return false;
2187    Src = Src.getOperand(0);
2188    Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2189    return true;
2190  case AMDGPU::CONST_COPY: {
2191    unsigned Opcode = ParentNode->getMachineOpcode();
2192    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2193
2194    if (!Sel.getNode())
2195      return false;
2196
2197    SDValue CstOffset = Src.getOperand(0);
2198    if (ParentNode->getValueType(0).isVector())
2199      return false;
2200
2201    // Gather constants values
2202    int SrcIndices[] = {
2203      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2204      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2205      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2206      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2207      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2208      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2209      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2210      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2211      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2212      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2213      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2214    };
2215    std::vector<unsigned> Consts;
2216    for (int OtherSrcIdx : SrcIndices) {
2217      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2218      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2219        continue;
2220      if (HasDst) {
2221        OtherSrcIdx--;
2222        OtherSelIdx--;
2223      }
2224      if (RegisterSDNode *Reg =
2225          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2226        if (Reg->getReg() == AMDGPU::ALU_CONST) {
2227          ConstantSDNode *Cst
2228            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2229          Consts.push_back(Cst->getZExtValue());
2230        }
2231      }
2232    }
2233
2234    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2235    Consts.push_back(Cst->getZExtValue());
2236    if (!TII->fitsConstReadLimitations(Consts)) {
2237      return false;
2238    }
2239
2240    Sel = CstOffset;
2241    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2242    return true;
2243  }
2244  case AMDGPU::MOV_IMM_GLOBAL_ADDR:
2245    // Check if the Imm slot is used. Taken from below.
2246    if (cast<ConstantSDNode>(Imm)->getZExtValue())
2247      return false;
2248    Imm = Src.getOperand(0);
2249    Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
2250    return true;
2251  case AMDGPU::MOV_IMM_I32:
2252  case AMDGPU::MOV_IMM_F32: {
2253    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2254    uint64_t ImmValue = 0;
2255
2256
2257    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2258      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2259      float FloatValue = FPC->getValueAPF().convertToFloat();
2260      if (FloatValue == 0.0) {
2261        ImmReg = AMDGPU::ZERO;
2262      } else if (FloatValue == 0.5) {
2263        ImmReg = AMDGPU::HALF;
2264      } else if (FloatValue == 1.0) {
2265        ImmReg = AMDGPU::ONE;
2266      } else {
2267        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2268      }
2269    } else {
2270      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2271      uint64_t Value = C->getZExtValue();
2272      if (Value == 0) {
2273        ImmReg = AMDGPU::ZERO;
2274      } else if (Value == 1) {
2275        ImmReg = AMDGPU::ONE_INT;
2276      } else {
2277        ImmValue = Value;
2278      }
2279    }
2280
2281    // Check that we aren't already using an immediate.
2282    // XXX: It's possible for an instruction to have more than one
2283    // immediate operand, but this is not supported yet.
2284    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2285      if (!Imm.getNode())
2286        return false;
2287      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2288      assert(C);
2289      if (C->getZExtValue())
2290        return false;
2291      Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2292    }
2293    Src = DAG.getRegister(ImmReg, MVT::i32);
2294    return true;
2295  }
2296  default:
2297    return false;
2298  }
2299}
2300
2301/// \brief Fold the instructions after selecting them
2302SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2303                                            SelectionDAG &DAG) const {
2304  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2305  if (!Node->isMachineOpcode())
2306    return Node;
2307
2308  unsigned Opcode = Node->getMachineOpcode();
2309  SDValue FakeOp;
2310
2311  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2312
2313  if (Opcode == AMDGPU::DOT_4) {
2314    int OperandIdx[] = {
2315      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2316      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2317      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2318      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2319      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2320      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2321      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2322      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2323        };
2324    int NegIdx[] = {
2325      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2326      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2327      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2328      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2329      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2330      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2331      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2332      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2333    };
2334    int AbsIdx[] = {
2335      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2336      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2337      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2338      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2339      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2340      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2341      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2342      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2343    };
2344    for (unsigned i = 0; i < 8; i++) {
2345      if (OperandIdx[i] < 0)
2346        return Node;
2347      SDValue &Src = Ops[OperandIdx[i] - 1];
2348      SDValue &Neg = Ops[NegIdx[i] - 1];
2349      SDValue &Abs = Ops[AbsIdx[i] - 1];
2350      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2351      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2352      if (HasDst)
2353        SelIdx--;
2354      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2355      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2356        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2357    }
2358  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2359    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2360      SDValue &Src = Ops[i];
2361      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2362        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2363    }
2364  } else if (Opcode == AMDGPU::CLAMP_R600) {
2365    SDValue Src = Node->getOperand(0);
2366    if (!Src.isMachineOpcode() ||
2367        !TII->hasInstrModifiers(Src.getMachineOpcode()))
2368      return Node;
2369    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2370        AMDGPU::OpName::clamp);
2371    if (ClampIdx < 0)
2372      return Node;
2373    SDLoc DL(Node);
2374    std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2375    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2376    return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2377                              Node->getVTList(), Ops);
2378  } else {
2379    if (!TII->hasInstrModifiers(Opcode))
2380      return Node;
2381    int OperandIdx[] = {
2382      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2383      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2384      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2385    };
2386    int NegIdx[] = {
2387      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2388      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2389      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2390    };
2391    int AbsIdx[] = {
2392      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2393      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2394      -1
2395    };
2396    for (unsigned i = 0; i < 3; i++) {
2397      if (OperandIdx[i] < 0)
2398        return Node;
2399      SDValue &Src = Ops[OperandIdx[i] - 1];
2400      SDValue &Neg = Ops[NegIdx[i] - 1];
2401      SDValue FakeAbs;
2402      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2403      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2404      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2405      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2406      if (HasDst) {
2407        SelIdx--;
2408        ImmIdx--;
2409      }
2410      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2411      SDValue &Imm = Ops[ImmIdx];
2412      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2413        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2414    }
2415  }
2416
2417  return Node;
2418}
2419