1//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for R600
12//
13//===----------------------------------------------------------------------===//
14
15#include "R600ISelLowering.h"
16#include "AMDGPUFrameLowering.h"
17#include "AMDGPUIntrinsicInfo.h"
18#include "AMDGPUSubtarget.h"
19#include "R600Defines.h"
20#include "R600InstrInfo.h"
21#include "R600MachineFunctionInfo.h"
22#include "llvm/Analysis/ValueTracking.h"
23#include "llvm/CodeGen/CallingConvLower.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/CodeGen/MachineInstrBuilder.h"
26#include "llvm/CodeGen/MachineRegisterInfo.h"
27#include "llvm/CodeGen/SelectionDAG.h"
28#include "llvm/IR/Argument.h"
29#include "llvm/IR/Function.h"
30
31using namespace llvm;
32
33R600TargetLowering::R600TargetLowering(TargetMachine &TM,
34                                       const AMDGPUSubtarget &STI)
35    : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
36  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
37  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
38  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
39  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
40  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
41  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
42
43  computeRegisterProperties(STI.getRegisterInfo());
44
45  // Set condition code actions
46  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
47  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
48  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
49  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
50  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
51  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
52  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
53  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
54  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
55  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
56  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
57  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
58
59  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
60  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
61  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
62  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
63
64  setOperationAction(ISD::FCOS, MVT::f32, Custom);
65  setOperationAction(ISD::FSIN, MVT::f32, Custom);
66
67  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
68  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
69
70  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
71  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
72  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
73
74  setOperationAction(ISD::FSUB, MVT::f32, Expand);
75
76  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
77  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
78  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
79
80  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
81  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
82
83  setOperationAction(ISD::SETCC, MVT::i32, Expand);
84  setOperationAction(ISD::SETCC, MVT::f32, Expand);
85  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
86  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
87  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
88
89  setOperationAction(ISD::SELECT, MVT::i32, Expand);
90  setOperationAction(ISD::SELECT, MVT::f32, Expand);
91  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
92  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
93
94  // Expand sign extension of vectors
95  if (!Subtarget->hasBFE())
96    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
97
98  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
99  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
100
101  if (!Subtarget->hasBFE())
102    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
103  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
104  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
105
106  if (!Subtarget->hasBFE())
107    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
108  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
109  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
110
111  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
112  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
113  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
114
115  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
116
117
118  // Legalize loads and stores to the private address space.
119  setOperationAction(ISD::LOAD, MVT::i32, Custom);
120  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
121  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
122
123  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
124  // spaces, so it is custom lowered to handle those where it isn't.
125  for (MVT VT : MVT::integer_valuetypes()) {
126    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
127    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
128    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
129
130    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
131    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
132    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
133
134    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
135    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
136    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
137  }
138
139  setOperationAction(ISD::STORE, MVT::i8, Custom);
140  setOperationAction(ISD::STORE, MVT::i32, Custom);
141  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
142  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
143  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
144  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
145
146  setOperationAction(ISD::LOAD, MVT::i32, Custom);
147  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
148  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
149
150  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
151  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
152  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
153  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
154
155  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
156  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
157  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
158  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
159
160  setTargetDAGCombine(ISD::FP_ROUND);
161  setTargetDAGCombine(ISD::FP_TO_SINT);
162  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
163  setTargetDAGCombine(ISD::SELECT_CC);
164  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
165
166  setOperationAction(ISD::SUB, MVT::i64, Expand);
167
168  // These should be replaced by UDVIREM, but it does not happen automatically
169  // during Type Legalization
170  setOperationAction(ISD::UDIV, MVT::i64, Custom);
171  setOperationAction(ISD::UREM, MVT::i64, Custom);
172  setOperationAction(ISD::SDIV, MVT::i64, Custom);
173  setOperationAction(ISD::SREM, MVT::i64, Custom);
174
175  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
176  //  to be Legal/Custom in order to avoid library calls.
177  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
178  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
179  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
180
181  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
182
183  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
184  for (MVT VT : ScalarIntVTs) {
185    setOperationAction(ISD::ADDC, VT, Expand);
186    setOperationAction(ISD::SUBC, VT, Expand);
187    setOperationAction(ISD::ADDE, VT, Expand);
188    setOperationAction(ISD::SUBE, VT, Expand);
189  }
190
191  setSchedulingPreference(Sched::Source);
192}
193
194MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
195    MachineInstr * MI, MachineBasicBlock * BB) const {
196  MachineFunction * MF = BB->getParent();
197  MachineRegisterInfo &MRI = MF->getRegInfo();
198  MachineBasicBlock::iterator I = *MI;
199  const R600InstrInfo *TII =
200      static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
201
202  switch (MI->getOpcode()) {
203  default:
204    // Replace LDS_*_RET instruction that don't have any uses with the
205    // equivalent LDS_*_NORET instruction.
206    if (TII->isLDSRetInstr(MI->getOpcode())) {
207      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
208      assert(DstIdx != -1);
209      MachineInstrBuilder NewMI;
210      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
211      //        LDS_1A2D support and remove this special case.
212      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
213           MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
214        return BB;
215
216      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
217                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
218      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
219        NewMI.addOperand(MI->getOperand(i));
220      }
221    } else {
222      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
223    }
224    break;
225  case AMDGPU::CLAMP_R600: {
226    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
227                                                   AMDGPU::MOV,
228                                                   MI->getOperand(0).getReg(),
229                                                   MI->getOperand(1).getReg());
230    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
231    break;
232  }
233
234  case AMDGPU::FABS_R600: {
235    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
236                                                    AMDGPU::MOV,
237                                                    MI->getOperand(0).getReg(),
238                                                    MI->getOperand(1).getReg());
239    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
240    break;
241  }
242
243  case AMDGPU::FNEG_R600: {
244    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
245                                                    AMDGPU::MOV,
246                                                    MI->getOperand(0).getReg(),
247                                                    MI->getOperand(1).getReg());
248    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
249    break;
250  }
251
252  case AMDGPU::MASK_WRITE: {
253    unsigned maskedRegister = MI->getOperand(0).getReg();
254    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
255    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
256    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
257    break;
258  }
259
260  case AMDGPU::MOV_IMM_F32:
261    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
262                     MI->getOperand(1).getFPImm()->getValueAPF()
263                         .bitcastToAPInt().getZExtValue());
264    break;
265  case AMDGPU::MOV_IMM_I32:
266    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
267                     MI->getOperand(1).getImm());
268    break;
269  case AMDGPU::CONST_COPY: {
270    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
271        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
272    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
273        MI->getOperand(1).getImm());
274    break;
275  }
276
277  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
278  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
279  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
280    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
281
282    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
283            .addOperand(MI->getOperand(0))
284            .addOperand(MI->getOperand(1))
285            .addImm(EOP); // Set End of program bit
286    break;
287  }
288
289  case AMDGPU::TXD: {
290    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
291    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
292    MachineOperand &RID = MI->getOperand(4);
293    MachineOperand &SID = MI->getOperand(5);
294    unsigned TextureId = MI->getOperand(6).getImm();
295    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
296    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
297
298    switch (TextureId) {
299    case 5: // Rect
300      CTX = CTY = 0;
301      break;
302    case 6: // Shadow1D
303      SrcW = SrcZ;
304      break;
305    case 7: // Shadow2D
306      SrcW = SrcZ;
307      break;
308    case 8: // ShadowRect
309      CTX = CTY = 0;
310      SrcW = SrcZ;
311      break;
312    case 9: // 1DArray
313      SrcZ = SrcY;
314      CTZ = 0;
315      break;
316    case 10: // 2DArray
317      CTZ = 0;
318      break;
319    case 11: // Shadow1DArray
320      SrcZ = SrcY;
321      CTZ = 0;
322      break;
323    case 12: // Shadow2DArray
324      CTZ = 0;
325      break;
326    }
327    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
328            .addOperand(MI->getOperand(3))
329            .addImm(SrcX)
330            .addImm(SrcY)
331            .addImm(SrcZ)
332            .addImm(SrcW)
333            .addImm(0)
334            .addImm(0)
335            .addImm(0)
336            .addImm(0)
337            .addImm(1)
338            .addImm(2)
339            .addImm(3)
340            .addOperand(RID)
341            .addOperand(SID)
342            .addImm(CTX)
343            .addImm(CTY)
344            .addImm(CTZ)
345            .addImm(CTW);
346    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
347            .addOperand(MI->getOperand(2))
348            .addImm(SrcX)
349            .addImm(SrcY)
350            .addImm(SrcZ)
351            .addImm(SrcW)
352            .addImm(0)
353            .addImm(0)
354            .addImm(0)
355            .addImm(0)
356            .addImm(1)
357            .addImm(2)
358            .addImm(3)
359            .addOperand(RID)
360            .addOperand(SID)
361            .addImm(CTX)
362            .addImm(CTY)
363            .addImm(CTZ)
364            .addImm(CTW);
365    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
366            .addOperand(MI->getOperand(0))
367            .addOperand(MI->getOperand(1))
368            .addImm(SrcX)
369            .addImm(SrcY)
370            .addImm(SrcZ)
371            .addImm(SrcW)
372            .addImm(0)
373            .addImm(0)
374            .addImm(0)
375            .addImm(0)
376            .addImm(1)
377            .addImm(2)
378            .addImm(3)
379            .addOperand(RID)
380            .addOperand(SID)
381            .addImm(CTX)
382            .addImm(CTY)
383            .addImm(CTZ)
384            .addImm(CTW)
385            .addReg(T0, RegState::Implicit)
386            .addReg(T1, RegState::Implicit);
387    break;
388  }
389
390  case AMDGPU::TXD_SHADOW: {
391    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
392    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
393    MachineOperand &RID = MI->getOperand(4);
394    MachineOperand &SID = MI->getOperand(5);
395    unsigned TextureId = MI->getOperand(6).getImm();
396    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
397    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
398
399    switch (TextureId) {
400    case 5: // Rect
401      CTX = CTY = 0;
402      break;
403    case 6: // Shadow1D
404      SrcW = SrcZ;
405      break;
406    case 7: // Shadow2D
407      SrcW = SrcZ;
408      break;
409    case 8: // ShadowRect
410      CTX = CTY = 0;
411      SrcW = SrcZ;
412      break;
413    case 9: // 1DArray
414      SrcZ = SrcY;
415      CTZ = 0;
416      break;
417    case 10: // 2DArray
418      CTZ = 0;
419      break;
420    case 11: // Shadow1DArray
421      SrcZ = SrcY;
422      CTZ = 0;
423      break;
424    case 12: // Shadow2DArray
425      CTZ = 0;
426      break;
427    }
428
429    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
430            .addOperand(MI->getOperand(3))
431            .addImm(SrcX)
432            .addImm(SrcY)
433            .addImm(SrcZ)
434            .addImm(SrcW)
435            .addImm(0)
436            .addImm(0)
437            .addImm(0)
438            .addImm(0)
439            .addImm(1)
440            .addImm(2)
441            .addImm(3)
442            .addOperand(RID)
443            .addOperand(SID)
444            .addImm(CTX)
445            .addImm(CTY)
446            .addImm(CTZ)
447            .addImm(CTW);
448    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
449            .addOperand(MI->getOperand(2))
450            .addImm(SrcX)
451            .addImm(SrcY)
452            .addImm(SrcZ)
453            .addImm(SrcW)
454            .addImm(0)
455            .addImm(0)
456            .addImm(0)
457            .addImm(0)
458            .addImm(1)
459            .addImm(2)
460            .addImm(3)
461            .addOperand(RID)
462            .addOperand(SID)
463            .addImm(CTX)
464            .addImm(CTY)
465            .addImm(CTZ)
466            .addImm(CTW);
467    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
468            .addOperand(MI->getOperand(0))
469            .addOperand(MI->getOperand(1))
470            .addImm(SrcX)
471            .addImm(SrcY)
472            .addImm(SrcZ)
473            .addImm(SrcW)
474            .addImm(0)
475            .addImm(0)
476            .addImm(0)
477            .addImm(0)
478            .addImm(1)
479            .addImm(2)
480            .addImm(3)
481            .addOperand(RID)
482            .addOperand(SID)
483            .addImm(CTX)
484            .addImm(CTY)
485            .addImm(CTZ)
486            .addImm(CTW)
487            .addReg(T0, RegState::Implicit)
488            .addReg(T1, RegState::Implicit);
489    break;
490  }
491
492  case AMDGPU::BRANCH:
493      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
494              .addOperand(MI->getOperand(0));
495      break;
496
497  case AMDGPU::BRANCH_COND_f32: {
498    MachineInstr *NewMI =
499      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
500              AMDGPU::PREDICATE_BIT)
501              .addOperand(MI->getOperand(1))
502              .addImm(OPCODE_IS_NOT_ZERO)
503              .addImm(0); // Flags
504    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
505    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
506            .addOperand(MI->getOperand(0))
507            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
508    break;
509  }
510
511  case AMDGPU::BRANCH_COND_i32: {
512    MachineInstr *NewMI =
513      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
514            AMDGPU::PREDICATE_BIT)
515            .addOperand(MI->getOperand(1))
516            .addImm(OPCODE_IS_NOT_ZERO_INT)
517            .addImm(0); // Flags
518    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
519    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
520           .addOperand(MI->getOperand(0))
521            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
522    break;
523  }
524
525  case AMDGPU::EG_ExportSwz:
526  case AMDGPU::R600_ExportSwz: {
527    // Instruction is left unmodified if its not the last one of its type
528    bool isLastInstructionOfItsType = true;
529    unsigned InstExportType = MI->getOperand(1).getImm();
530    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
531         EndBlock = BB->end(); NextExportInst != EndBlock;
532         NextExportInst = std::next(NextExportInst)) {
533      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
534          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
535        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
536            .getImm();
537        if (CurrentInstExportType == InstExportType) {
538          isLastInstructionOfItsType = false;
539          break;
540        }
541      }
542    }
543    bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
544    if (!EOP && !isLastInstructionOfItsType)
545      return BB;
546    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
547    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
548            .addOperand(MI->getOperand(0))
549            .addOperand(MI->getOperand(1))
550            .addOperand(MI->getOperand(2))
551            .addOperand(MI->getOperand(3))
552            .addOperand(MI->getOperand(4))
553            .addOperand(MI->getOperand(5))
554            .addOperand(MI->getOperand(6))
555            .addImm(CfInst)
556            .addImm(EOP);
557    break;
558  }
559  case AMDGPU::RETURN: {
560    // RETURN instructions must have the live-out registers as implicit uses,
561    // otherwise they appear dead.
562    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
563    MachineInstrBuilder MIB(*MF, MI);
564    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
565      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
566    return BB;
567  }
568  }
569
570  MI->eraseFromParent();
571  return BB;
572}
573
574//===----------------------------------------------------------------------===//
575// Custom DAG Lowering Operations
576//===----------------------------------------------------------------------===//
577
578SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
579  MachineFunction &MF = DAG.getMachineFunction();
580  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
581  switch (Op.getOpcode()) {
582  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
583  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
584  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
585  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
586  case ISD::SRA_PARTS:
587  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
588  case ISD::FCOS:
589  case ISD::FSIN: return LowerTrig(Op, DAG);
590  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
591  case ISD::STORE: return LowerSTORE(Op, DAG);
592  case ISD::LOAD: {
593    SDValue Result = LowerLOAD(Op, DAG);
594    assert((!Result.getNode() ||
595            Result.getNode()->getNumValues() == 2) &&
596           "Load should return a value and a chain");
597    return Result;
598  }
599
600  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
601  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
602  case ISD::INTRINSIC_VOID: {
603    SDValue Chain = Op.getOperand(0);
604    unsigned IntrinsicID =
605                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
606    switch (IntrinsicID) {
607    case AMDGPUIntrinsic::AMDGPU_store_output: {
608      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
609      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
610      MFI->LiveOuts.push_back(Reg);
611      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
612    }
613    case AMDGPUIntrinsic::R600_store_swizzle: {
614      const SDValue Args[8] = {
615        Chain,
616        Op.getOperand(2), // Export Value
617        Op.getOperand(3), // ArrayBase
618        Op.getOperand(4), // Type
619        DAG.getConstant(0, MVT::i32), // SWZ_X
620        DAG.getConstant(1, MVT::i32), // SWZ_Y
621        DAG.getConstant(2, MVT::i32), // SWZ_Z
622        DAG.getConstant(3, MVT::i32) // SWZ_W
623      };
624      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
625    }
626
627    // default for switch(IntrinsicID)
628    default: break;
629    }
630    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
631    break;
632  }
633  case ISD::INTRINSIC_WO_CHAIN: {
634    unsigned IntrinsicID =
635                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
636    EVT VT = Op.getValueType();
637    SDLoc DL(Op);
638    switch(IntrinsicID) {
639    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
640    case AMDGPUIntrinsic::R600_load_input: {
641      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
642      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
643      MachineFunction &MF = DAG.getMachineFunction();
644      MachineRegisterInfo &MRI = MF.getRegInfo();
645      MRI.addLiveIn(Reg);
646      return DAG.getCopyFromReg(DAG.getEntryNode(),
647          SDLoc(DAG.getEntryNode()), Reg, VT);
648    }
649
650    case AMDGPUIntrinsic::R600_interp_input: {
651      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
652      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
653      MachineSDNode *interp;
654      if (ijb < 0) {
655        const R600InstrInfo *TII =
656            static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
657        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
658            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
659        return DAG.getTargetExtractSubreg(
660            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
661            DL, MVT::f32, SDValue(interp, 0));
662      }
663      MachineFunction &MF = DAG.getMachineFunction();
664      MachineRegisterInfo &MRI = MF.getRegInfo();
665      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
666      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
667      MRI.addLiveIn(RegisterI);
668      MRI.addLiveIn(RegisterJ);
669      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
670          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
671      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
672          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
673
674      if (slot % 4 < 2)
675        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
676            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
677            RegisterJNode, RegisterINode);
678      else
679        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
680            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
681            RegisterJNode, RegisterINode);
682      return SDValue(interp, slot % 2);
683    }
684    case AMDGPUIntrinsic::R600_interp_xy:
685    case AMDGPUIntrinsic::R600_interp_zw: {
686      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
687      MachineSDNode *interp;
688      SDValue RegisterINode = Op.getOperand(2);
689      SDValue RegisterJNode = Op.getOperand(3);
690
691      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
692        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
693            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
694            RegisterJNode, RegisterINode);
695      else
696        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
697            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
698            RegisterJNode, RegisterINode);
699      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
700          SDValue(interp, 0), SDValue(interp, 1));
701    }
702    case AMDGPUIntrinsic::R600_tex:
703    case AMDGPUIntrinsic::R600_texc:
704    case AMDGPUIntrinsic::R600_txl:
705    case AMDGPUIntrinsic::R600_txlc:
706    case AMDGPUIntrinsic::R600_txb:
707    case AMDGPUIntrinsic::R600_txbc:
708    case AMDGPUIntrinsic::R600_txf:
709    case AMDGPUIntrinsic::R600_txq:
710    case AMDGPUIntrinsic::R600_ddx:
711    case AMDGPUIntrinsic::R600_ddy:
712    case AMDGPUIntrinsic::R600_ldptr: {
713      unsigned TextureOp;
714      switch (IntrinsicID) {
715      case AMDGPUIntrinsic::R600_tex:
716        TextureOp = 0;
717        break;
718      case AMDGPUIntrinsic::R600_texc:
719        TextureOp = 1;
720        break;
721      case AMDGPUIntrinsic::R600_txl:
722        TextureOp = 2;
723        break;
724      case AMDGPUIntrinsic::R600_txlc:
725        TextureOp = 3;
726        break;
727      case AMDGPUIntrinsic::R600_txb:
728        TextureOp = 4;
729        break;
730      case AMDGPUIntrinsic::R600_txbc:
731        TextureOp = 5;
732        break;
733      case AMDGPUIntrinsic::R600_txf:
734        TextureOp = 6;
735        break;
736      case AMDGPUIntrinsic::R600_txq:
737        TextureOp = 7;
738        break;
739      case AMDGPUIntrinsic::R600_ddx:
740        TextureOp = 8;
741        break;
742      case AMDGPUIntrinsic::R600_ddy:
743        TextureOp = 9;
744        break;
745      case AMDGPUIntrinsic::R600_ldptr:
746        TextureOp = 10;
747        break;
748      default:
749        llvm_unreachable("Unknow Texture Operation");
750      }
751
752      SDValue TexArgs[19] = {
753        DAG.getConstant(TextureOp, MVT::i32),
754        Op.getOperand(1),
755        DAG.getConstant(0, MVT::i32),
756        DAG.getConstant(1, MVT::i32),
757        DAG.getConstant(2, MVT::i32),
758        DAG.getConstant(3, MVT::i32),
759        Op.getOperand(2),
760        Op.getOperand(3),
761        Op.getOperand(4),
762        DAG.getConstant(0, MVT::i32),
763        DAG.getConstant(1, MVT::i32),
764        DAG.getConstant(2, MVT::i32),
765        DAG.getConstant(3, MVT::i32),
766        Op.getOperand(5),
767        Op.getOperand(6),
768        Op.getOperand(7),
769        Op.getOperand(8),
770        Op.getOperand(9),
771        Op.getOperand(10)
772      };
773      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
774    }
775    case AMDGPUIntrinsic::AMDGPU_dp4: {
776      SDValue Args[8] = {
777      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
778          DAG.getConstant(0, MVT::i32)),
779      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
780          DAG.getConstant(0, MVT::i32)),
781      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
782          DAG.getConstant(1, MVT::i32)),
783      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
784          DAG.getConstant(1, MVT::i32)),
785      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
786          DAG.getConstant(2, MVT::i32)),
787      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
788          DAG.getConstant(2, MVT::i32)),
789      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
790          DAG.getConstant(3, MVT::i32)),
791      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
792          DAG.getConstant(3, MVT::i32))
793      };
794      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
795    }
796
797    case Intrinsic::r600_read_ngroups_x:
798      return LowerImplicitParameter(DAG, VT, DL, 0);
799    case Intrinsic::r600_read_ngroups_y:
800      return LowerImplicitParameter(DAG, VT, DL, 1);
801    case Intrinsic::r600_read_ngroups_z:
802      return LowerImplicitParameter(DAG, VT, DL, 2);
803    case Intrinsic::r600_read_global_size_x:
804      return LowerImplicitParameter(DAG, VT, DL, 3);
805    case Intrinsic::r600_read_global_size_y:
806      return LowerImplicitParameter(DAG, VT, DL, 4);
807    case Intrinsic::r600_read_global_size_z:
808      return LowerImplicitParameter(DAG, VT, DL, 5);
809    case Intrinsic::r600_read_local_size_x:
810      return LowerImplicitParameter(DAG, VT, DL, 6);
811    case Intrinsic::r600_read_local_size_y:
812      return LowerImplicitParameter(DAG, VT, DL, 7);
813    case Intrinsic::r600_read_local_size_z:
814      return LowerImplicitParameter(DAG, VT, DL, 8);
815
816    case Intrinsic::AMDGPU_read_workdim:
817      return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
818
819    case Intrinsic::r600_read_tgid_x:
820      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
821                                  AMDGPU::T1_X, VT);
822    case Intrinsic::r600_read_tgid_y:
823      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
824                                  AMDGPU::T1_Y, VT);
825    case Intrinsic::r600_read_tgid_z:
826      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
827                                  AMDGPU::T1_Z, VT);
828    case Intrinsic::r600_read_tidig_x:
829      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
830                                  AMDGPU::T0_X, VT);
831    case Intrinsic::r600_read_tidig_y:
832      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
833                                  AMDGPU::T0_Y, VT);
834    case Intrinsic::r600_read_tidig_z:
835      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
836                                  AMDGPU::T0_Z, VT);
837    case Intrinsic::AMDGPU_rsq:
838      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
839      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
840
841    case AMDGPUIntrinsic::AMDGPU_fract:
842    case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
843      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
844    }
845    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
846    break;
847  }
848  } // end switch(Op.getOpcode())
849  return SDValue();
850}
851
852void R600TargetLowering::ReplaceNodeResults(SDNode *N,
853                                            SmallVectorImpl<SDValue> &Results,
854                                            SelectionDAG &DAG) const {
855  switch (N->getOpcode()) {
856  default:
857    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
858    return;
859  case ISD::FP_TO_UINT:
860    if (N->getValueType(0) == MVT::i1) {
861      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
862      return;
863    }
864    // Fall-through. Since we don't care about out of bounds values
865    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
866    // considers some extra cases which are not necessary here.
867  case ISD::FP_TO_SINT: {
868    SDValue Result;
869    if (expandFP_TO_SINT(N, Result, DAG))
870      Results.push_back(Result);
871    return;
872  }
873  case ISD::UDIV: {
874    SDValue Op = SDValue(N, 0);
875    SDLoc DL(Op);
876    EVT VT = Op.getValueType();
877    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
878      N->getOperand(0), N->getOperand(1));
879    Results.push_back(UDIVREM);
880    break;
881  }
882  case ISD::UREM: {
883    SDValue Op = SDValue(N, 0);
884    SDLoc DL(Op);
885    EVT VT = Op.getValueType();
886    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
887      N->getOperand(0), N->getOperand(1));
888    Results.push_back(UDIVREM.getValue(1));
889    break;
890  }
891  case ISD::SDIV: {
892    SDValue Op = SDValue(N, 0);
893    SDLoc DL(Op);
894    EVT VT = Op.getValueType();
895    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
896      N->getOperand(0), N->getOperand(1));
897    Results.push_back(SDIVREM);
898    break;
899  }
900  case ISD::SREM: {
901    SDValue Op = SDValue(N, 0);
902    SDLoc DL(Op);
903    EVT VT = Op.getValueType();
904    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
905      N->getOperand(0), N->getOperand(1));
906    Results.push_back(SDIVREM.getValue(1));
907    break;
908  }
909  case ISD::SDIVREM: {
910    SDValue Op = SDValue(N, 1);
911    SDValue RES = LowerSDIVREM(Op, DAG);
912    Results.push_back(RES);
913    Results.push_back(RES.getValue(1));
914    break;
915  }
916  case ISD::UDIVREM: {
917    SDValue Op = SDValue(N, 0);
918    LowerUDIVREM64(Op, DAG, Results);
919    break;
920  }
921  }
922}
923
924SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
925                                                   SDValue Vector) const {
926
927  SDLoc DL(Vector);
928  EVT VecVT = Vector.getValueType();
929  EVT EltVT = VecVT.getVectorElementType();
930  SmallVector<SDValue, 8> Args;
931
932  for (unsigned i = 0, e = VecVT.getVectorNumElements();
933                                                           i != e; ++i) {
934    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
935                               Vector, DAG.getConstant(i, getVectorIdxTy())));
936  }
937
938  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
939}
940
941SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
942                                                    SelectionDAG &DAG) const {
943
944  SDLoc DL(Op);
945  SDValue Vector = Op.getOperand(0);
946  SDValue Index = Op.getOperand(1);
947
948  if (isa<ConstantSDNode>(Index) ||
949      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
950    return Op;
951
952  Vector = vectorToVerticalVector(DAG, Vector);
953  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
954                     Vector, Index);
955}
956
957SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
958                                                   SelectionDAG &DAG) const {
959  SDLoc DL(Op);
960  SDValue Vector = Op.getOperand(0);
961  SDValue Value = Op.getOperand(1);
962  SDValue Index = Op.getOperand(2);
963
964  if (isa<ConstantSDNode>(Index) ||
965      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
966    return Op;
967
968  Vector = vectorToVerticalVector(DAG, Vector);
969  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
970                               Vector, Value, Index);
971  return vectorToVerticalVector(DAG, Insert);
972}
973
974SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
975  // On hw >= R700, COS/SIN input must be between -1. and 1.
976  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
977  EVT VT = Op.getValueType();
978  SDValue Arg = Op.getOperand(0);
979  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
980      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
981        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
982          DAG.getConstantFP(0.15915494309, MVT::f32)),
983        DAG.getConstantFP(0.5, MVT::f32)));
984  unsigned TrigNode;
985  switch (Op.getOpcode()) {
986  case ISD::FCOS:
987    TrigNode = AMDGPUISD::COS_HW;
988    break;
989  case ISD::FSIN:
990    TrigNode = AMDGPUISD::SIN_HW;
991    break;
992  default:
993    llvm_unreachable("Wrong trig opcode");
994  }
995  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
996      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
997        DAG.getConstantFP(-0.5, MVT::f32)));
998  if (Gen >= AMDGPUSubtarget::R700)
999    return TrigVal;
1000  // On R600 hw, COS/SIN input must be between -Pi and Pi.
1001  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
1002      DAG.getConstantFP(3.14159265359, MVT::f32));
1003}
1004
1005SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1006  SDLoc DL(Op);
1007  EVT VT = Op.getValueType();
1008
1009  SDValue Lo = Op.getOperand(0);
1010  SDValue Hi = Op.getOperand(1);
1011  SDValue Shift = Op.getOperand(2);
1012  SDValue Zero = DAG.getConstant(0, VT);
1013  SDValue One  = DAG.getConstant(1, VT);
1014
1015  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1016  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1017  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1018  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1019
1020  // The dance around Width1 is necessary for 0 special case.
1021  // Without it the CompShift might be 32, producing incorrect results in
1022  // Overflow. So we do the shift in two steps, the alternative is to
1023  // add a conditional to filter the special case.
1024
1025  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1026  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1027
1028  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1029  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1030  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1031
1032  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1033  SDValue LoBig = Zero;
1034
1035  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1036  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1037
1038  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1039}
1040
1041SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1042  SDLoc DL(Op);
1043  EVT VT = Op.getValueType();
1044
1045  SDValue Lo = Op.getOperand(0);
1046  SDValue Hi = Op.getOperand(1);
1047  SDValue Shift = Op.getOperand(2);
1048  SDValue Zero = DAG.getConstant(0, VT);
1049  SDValue One  = DAG.getConstant(1, VT);
1050
1051  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1052
1053  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1054  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1055  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1056  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1057
1058  // The dance around Width1 is necessary for 0 special case.
1059  // Without it the CompShift might be 32, producing incorrect results in
1060  // Overflow. So we do the shift in two steps, the alternative is to
1061  // add a conditional to filter the special case.
1062
1063  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1064  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1065
1066  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1067  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1068  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1069
1070  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1071  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1072
1073  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1074  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1075
1076  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1077}
1078
1079SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1080  return DAG.getNode(
1081      ISD::SETCC,
1082      SDLoc(Op),
1083      MVT::i1,
1084      Op, DAG.getConstantFP(0.0f, MVT::f32),
1085      DAG.getCondCode(ISD::SETNE)
1086      );
1087}
1088
1089SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1090                                                   SDLoc DL,
1091                                                   unsigned DwordOffset) const {
1092  unsigned ByteOffset = DwordOffset * 4;
1093  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1094                                      AMDGPUAS::CONSTANT_BUFFER_0);
1095
1096  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1097  assert(isInt<16>(ByteOffset));
1098
1099  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1100                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
1101                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1102                     false, false, false, 0);
1103}
1104
1105bool R600TargetLowering::isZero(SDValue Op) const {
1106  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1107    return Cst->isNullValue();
1108  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1109    return CstFP->isZero();
1110  } else {
1111    return false;
1112  }
1113}
1114
1115SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1116  SDLoc DL(Op);
1117  EVT VT = Op.getValueType();
1118
1119  SDValue LHS = Op.getOperand(0);
1120  SDValue RHS = Op.getOperand(1);
1121  SDValue True = Op.getOperand(2);
1122  SDValue False = Op.getOperand(3);
1123  SDValue CC = Op.getOperand(4);
1124  SDValue Temp;
1125
1126  if (VT == MVT::f32) {
1127    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1128    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1129    if (MinMax)
1130      return MinMax;
1131  }
1132
1133  // LHS and RHS are guaranteed to be the same value type
1134  EVT CompareVT = LHS.getValueType();
1135
1136  // Check if we can lower this to a native operation.
1137
1138  // Try to lower to a SET* instruction:
1139  //
1140  // SET* can match the following patterns:
1141  //
1142  // select_cc f32, f32, -1,  0, cc_supported
1143  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1144  // select_cc i32, i32, -1,  0, cc_supported
1145  //
1146
1147  // Move hardware True/False values to the correct operand.
1148  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1149  ISD::CondCode InverseCC =
1150     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1151  if (isHWTrueValue(False) && isHWFalseValue(True)) {
1152    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1153      std::swap(False, True);
1154      CC = DAG.getCondCode(InverseCC);
1155    } else {
1156      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1157      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1158        std::swap(False, True);
1159        std::swap(LHS, RHS);
1160        CC = DAG.getCondCode(SwapInvCC);
1161      }
1162    }
1163  }
1164
1165  if (isHWTrueValue(True) && isHWFalseValue(False) &&
1166      (CompareVT == VT || VT == MVT::i32)) {
1167    // This can be matched by a SET* instruction.
1168    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1169  }
1170
1171  // Try to lower to a CND* instruction:
1172  //
1173  // CND* can match the following patterns:
1174  //
1175  // select_cc f32, 0.0, f32, f32, cc_supported
1176  // select_cc f32, 0.0, i32, i32, cc_supported
1177  // select_cc i32, 0,   f32, f32, cc_supported
1178  // select_cc i32, 0,   i32, i32, cc_supported
1179  //
1180
1181  // Try to move the zero value to the RHS
1182  if (isZero(LHS)) {
1183    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1184    // Try swapping the operands
1185    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1186    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1187      std::swap(LHS, RHS);
1188      CC = DAG.getCondCode(CCSwapped);
1189    } else {
1190      // Try inverting the conditon and then swapping the operands
1191      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1192      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1193      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1194        std::swap(True, False);
1195        std::swap(LHS, RHS);
1196        CC = DAG.getCondCode(CCSwapped);
1197      }
1198    }
1199  }
1200  if (isZero(RHS)) {
1201    SDValue Cond = LHS;
1202    SDValue Zero = RHS;
1203    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1204    if (CompareVT != VT) {
1205      // Bitcast True / False to the correct types.  This will end up being
1206      // a nop, but it allows us to define only a single pattern in the
1207      // .TD files for each CND* instruction rather than having to have
1208      // one pattern for integer True/False and one for fp True/False
1209      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1210      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1211    }
1212
1213    switch (CCOpcode) {
1214    case ISD::SETONE:
1215    case ISD::SETUNE:
1216    case ISD::SETNE:
1217      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1218      Temp = True;
1219      True = False;
1220      False = Temp;
1221      break;
1222    default:
1223      break;
1224    }
1225    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1226        Cond, Zero,
1227        True, False,
1228        DAG.getCondCode(CCOpcode));
1229    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1230  }
1231
1232  // If we make it this for it means we have no native instructions to handle
1233  // this SELECT_CC, so we must lower it.
1234  SDValue HWTrue, HWFalse;
1235
1236  if (CompareVT == MVT::f32) {
1237    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1238    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1239  } else if (CompareVT == MVT::i32) {
1240    HWTrue = DAG.getConstant(-1, CompareVT);
1241    HWFalse = DAG.getConstant(0, CompareVT);
1242  }
1243  else {
1244    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1245  }
1246
1247  // Lower this unsupported SELECT_CC into a combination of two supported
1248  // SELECT_CC operations.
1249  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1250
1251  return DAG.getNode(ISD::SELECT_CC, DL, VT,
1252      Cond, HWFalse,
1253      True, False,
1254      DAG.getCondCode(ISD::SETNE));
1255}
1256
1257/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1258/// convert these pointers to a register index.  Each register holds
1259/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1260/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1261/// for indirect addressing.
1262SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1263                                               unsigned StackWidth,
1264                                               SelectionDAG &DAG) const {
1265  unsigned SRLPad;
1266  switch(StackWidth) {
1267  case 1:
1268    SRLPad = 2;
1269    break;
1270  case 2:
1271    SRLPad = 3;
1272    break;
1273  case 4:
1274    SRLPad = 4;
1275    break;
1276  default: llvm_unreachable("Invalid stack width");
1277  }
1278
1279  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1280                     DAG.getConstant(SRLPad, MVT::i32));
1281}
1282
1283void R600TargetLowering::getStackAddress(unsigned StackWidth,
1284                                         unsigned ElemIdx,
1285                                         unsigned &Channel,
1286                                         unsigned &PtrIncr) const {
1287  switch (StackWidth) {
1288  default:
1289  case 1:
1290    Channel = 0;
1291    if (ElemIdx > 0) {
1292      PtrIncr = 1;
1293    } else {
1294      PtrIncr = 0;
1295    }
1296    break;
1297  case 2:
1298    Channel = ElemIdx % 2;
1299    if (ElemIdx == 2) {
1300      PtrIncr = 1;
1301    } else {
1302      PtrIncr = 0;
1303    }
1304    break;
1305  case 4:
1306    Channel = ElemIdx;
1307    PtrIncr = 0;
1308    break;
1309  }
1310}
1311
1312SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1313  SDLoc DL(Op);
1314  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1315  SDValue Chain = Op.getOperand(0);
1316  SDValue Value = Op.getOperand(1);
1317  SDValue Ptr = Op.getOperand(2);
1318
1319  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1320  if (Result.getNode()) {
1321    return Result;
1322  }
1323
1324  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1325    if (StoreNode->isTruncatingStore()) {
1326      EVT VT = Value.getValueType();
1327      assert(VT.bitsLE(MVT::i32));
1328      EVT MemVT = StoreNode->getMemoryVT();
1329      SDValue MaskConstant;
1330      if (MemVT == MVT::i8) {
1331        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1332      } else {
1333        assert(MemVT == MVT::i16);
1334        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1335      }
1336      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1337                                      DAG.getConstant(2, MVT::i32));
1338      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1339                                      DAG.getConstant(0x00000003, VT));
1340      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1341      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1342                                   DAG.getConstant(3, VT));
1343      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1344      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1345      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1346      // vector instead.
1347      SDValue Src[4] = {
1348        ShiftedValue,
1349        DAG.getConstant(0, MVT::i32),
1350        DAG.getConstant(0, MVT::i32),
1351        Mask
1352      };
1353      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1354      SDValue Args[3] = { Chain, Input, DWordAddr };
1355      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1356                                     Op->getVTList(), Args, MemVT,
1357                                     StoreNode->getMemOperand());
1358    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1359               Value.getValueType().bitsGE(MVT::i32)) {
1360      // Convert pointer from byte address to dword address.
1361      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1362                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1363                                    Ptr, DAG.getConstant(2, MVT::i32)));
1364
1365      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1366        llvm_unreachable("Truncated and indexed stores not supported yet");
1367      } else {
1368        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1369      }
1370      return Chain;
1371    }
1372  }
1373
1374  EVT ValueVT = Value.getValueType();
1375
1376  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1377    return SDValue();
1378  }
1379
1380  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1381  if (Ret.getNode()) {
1382    return Ret;
1383  }
1384  // Lowering for indirect addressing
1385
1386  const MachineFunction &MF = DAG.getMachineFunction();
1387  const AMDGPUFrameLowering *TFL =
1388      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1389  unsigned StackWidth = TFL->getStackWidth(MF);
1390
1391  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1392
1393  if (ValueVT.isVector()) {
1394    unsigned NumElemVT = ValueVT.getVectorNumElements();
1395    EVT ElemVT = ValueVT.getVectorElementType();
1396    SmallVector<SDValue, 4> Stores(NumElemVT);
1397
1398    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1399                                      "vector width in load");
1400
1401    for (unsigned i = 0; i < NumElemVT; ++i) {
1402      unsigned Channel, PtrIncr;
1403      getStackAddress(StackWidth, i, Channel, PtrIncr);
1404      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1405                        DAG.getConstant(PtrIncr, MVT::i32));
1406      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1407                                 Value, DAG.getConstant(i, MVT::i32));
1408
1409      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1410                              Chain, Elem, Ptr,
1411                              DAG.getTargetConstant(Channel, MVT::i32));
1412    }
1413     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1414   } else {
1415    if (ValueVT == MVT::i8) {
1416      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1417    }
1418    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1419    DAG.getTargetConstant(0, MVT::i32)); // Channel
1420  }
1421
1422  return Chain;
1423}
1424
1425// return (512 + (kc_bank << 12)
1426static int
1427ConstantAddressBlock(unsigned AddressSpace) {
1428  switch (AddressSpace) {
1429  case AMDGPUAS::CONSTANT_BUFFER_0:
1430    return 512;
1431  case AMDGPUAS::CONSTANT_BUFFER_1:
1432    return 512 + 4096;
1433  case AMDGPUAS::CONSTANT_BUFFER_2:
1434    return 512 + 4096 * 2;
1435  case AMDGPUAS::CONSTANT_BUFFER_3:
1436    return 512 + 4096 * 3;
1437  case AMDGPUAS::CONSTANT_BUFFER_4:
1438    return 512 + 4096 * 4;
1439  case AMDGPUAS::CONSTANT_BUFFER_5:
1440    return 512 + 4096 * 5;
1441  case AMDGPUAS::CONSTANT_BUFFER_6:
1442    return 512 + 4096 * 6;
1443  case AMDGPUAS::CONSTANT_BUFFER_7:
1444    return 512 + 4096 * 7;
1445  case AMDGPUAS::CONSTANT_BUFFER_8:
1446    return 512 + 4096 * 8;
1447  case AMDGPUAS::CONSTANT_BUFFER_9:
1448    return 512 + 4096 * 9;
1449  case AMDGPUAS::CONSTANT_BUFFER_10:
1450    return 512 + 4096 * 10;
1451  case AMDGPUAS::CONSTANT_BUFFER_11:
1452    return 512 + 4096 * 11;
1453  case AMDGPUAS::CONSTANT_BUFFER_12:
1454    return 512 + 4096 * 12;
1455  case AMDGPUAS::CONSTANT_BUFFER_13:
1456    return 512 + 4096 * 13;
1457  case AMDGPUAS::CONSTANT_BUFFER_14:
1458    return 512 + 4096 * 14;
1459  case AMDGPUAS::CONSTANT_BUFFER_15:
1460    return 512 + 4096 * 15;
1461  default:
1462    return -1;
1463  }
1464}
1465
1466SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1467{
1468  EVT VT = Op.getValueType();
1469  SDLoc DL(Op);
1470  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1471  SDValue Chain = Op.getOperand(0);
1472  SDValue Ptr = Op.getOperand(1);
1473  SDValue LoweredLoad;
1474
1475  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1476  if (Ret.getNode()) {
1477    SDValue Ops[2] = {
1478      Ret,
1479      Chain
1480    };
1481    return DAG.getMergeValues(Ops, DL);
1482  }
1483
1484  // Lower loads constant address space global variable loads
1485  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1486      isa<GlobalVariable>(GetUnderlyingObject(
1487          LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
1488
1489    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1490        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1491    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1492        DAG.getConstant(2, MVT::i32));
1493    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1494                       LoadNode->getChain(), Ptr,
1495                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
1496  }
1497
1498  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1499    SDValue MergedValues[2] = {
1500      ScalarizeVectorLoad(Op, DAG),
1501      Chain
1502    };
1503    return DAG.getMergeValues(MergedValues, DL);
1504  }
1505
1506  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1507  if (ConstantBlock > -1 &&
1508      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1509       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1510    SDValue Result;
1511    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1512        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1513        isa<ConstantSDNode>(Ptr)) {
1514      SDValue Slots[4];
1515      for (unsigned i = 0; i < 4; i++) {
1516        // We want Const position encoded with the following formula :
1517        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1518        // const_index is Ptr computed by llvm using an alignment of 16.
1519        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1520        // then div by 4 at the ISel step
1521        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1522            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1523        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1524      }
1525      EVT NewVT = MVT::v4i32;
1526      unsigned NumElements = 4;
1527      if (VT.isVector()) {
1528        NewVT = VT;
1529        NumElements = VT.getVectorNumElements();
1530      }
1531      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1532                           makeArrayRef(Slots, NumElements));
1533    } else {
1534      // non-constant ptr can't be folded, keeps it as a v4f32 load
1535      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1536          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1537          DAG.getConstant(LoadNode->getAddressSpace() -
1538                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1539          );
1540    }
1541
1542    if (!VT.isVector()) {
1543      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1544          DAG.getConstant(0, MVT::i32));
1545    }
1546
1547    SDValue MergedValues[2] = {
1548      Result,
1549      Chain
1550    };
1551    return DAG.getMergeValues(MergedValues, DL);
1552  }
1553
1554  // For most operations returning SDValue() will result in the node being
1555  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1556  // need to manually expand loads that may be legal in some address spaces and
1557  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1558  // compute shaders, since the data is sign extended when it is uploaded to the
1559  // buffer. However SEXT loads from other address spaces are not supported, so
1560  // we need to expand them here.
1561  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1562    EVT MemVT = LoadNode->getMemoryVT();
1563    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1564    SDValue ShiftAmount =
1565          DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1566    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1567                                  LoadNode->getPointerInfo(), MemVT,
1568                                  LoadNode->isVolatile(),
1569                                  LoadNode->isNonTemporal(),
1570                                  LoadNode->isInvariant(),
1571                                  LoadNode->getAlignment());
1572    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1573    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1574
1575    SDValue MergedValues[2] = { Sra, Chain };
1576    return DAG.getMergeValues(MergedValues, DL);
1577  }
1578
1579  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1580    return SDValue();
1581  }
1582
1583  // Lowering for indirect addressing
1584  const MachineFunction &MF = DAG.getMachineFunction();
1585  const AMDGPUFrameLowering *TFL =
1586      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1587  unsigned StackWidth = TFL->getStackWidth(MF);
1588
1589  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1590
1591  if (VT.isVector()) {
1592    unsigned NumElemVT = VT.getVectorNumElements();
1593    EVT ElemVT = VT.getVectorElementType();
1594    SDValue Loads[4];
1595
1596    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1597                                      "vector width in load");
1598
1599    for (unsigned i = 0; i < NumElemVT; ++i) {
1600      unsigned Channel, PtrIncr;
1601      getStackAddress(StackWidth, i, Channel, PtrIncr);
1602      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1603                        DAG.getConstant(PtrIncr, MVT::i32));
1604      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1605                             Chain, Ptr,
1606                             DAG.getTargetConstant(Channel, MVT::i32),
1607                             Op.getOperand(2));
1608    }
1609    for (unsigned i = NumElemVT; i < 4; ++i) {
1610      Loads[i] = DAG.getUNDEF(ElemVT);
1611    }
1612    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1613    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1614  } else {
1615    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1616                              Chain, Ptr,
1617                              DAG.getTargetConstant(0, MVT::i32), // Channel
1618                              Op.getOperand(2));
1619  }
1620
1621  SDValue Ops[2] = {
1622    LoweredLoad,
1623    Chain
1624  };
1625
1626  return DAG.getMergeValues(Ops, DL);
1627}
1628
1629SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1630  SDValue Chain = Op.getOperand(0);
1631  SDValue Cond  = Op.getOperand(1);
1632  SDValue Jump  = Op.getOperand(2);
1633
1634  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1635                     Chain, Jump, Cond);
1636}
1637
1638/// XXX Only kernel functions are supported, so we can assume for now that
1639/// every function is a kernel function, but in the future we should use
1640/// separate calling conventions for kernel and non-kernel functions.
1641SDValue R600TargetLowering::LowerFormalArguments(
1642                                      SDValue Chain,
1643                                      CallingConv::ID CallConv,
1644                                      bool isVarArg,
1645                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1646                                      SDLoc DL, SelectionDAG &DAG,
1647                                      SmallVectorImpl<SDValue> &InVals) const {
1648  SmallVector<CCValAssign, 16> ArgLocs;
1649  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1650                 *DAG.getContext());
1651  MachineFunction &MF = DAG.getMachineFunction();
1652  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1653
1654  SmallVector<ISD::InputArg, 8> LocalIns;
1655
1656  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1657
1658  AnalyzeFormalArguments(CCInfo, LocalIns);
1659
1660  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1661    CCValAssign &VA = ArgLocs[i];
1662    const ISD::InputArg &In = Ins[i];
1663    EVT VT = In.VT;
1664    EVT MemVT = VA.getLocVT();
1665    if (!VT.isVector() && MemVT.isVector()) {
1666      // Get load source type if scalarized.
1667      MemVT = MemVT.getVectorElementType();
1668    }
1669
1670    if (MFI->getShaderType() != ShaderType::COMPUTE) {
1671      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1672      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1673      InVals.push_back(Register);
1674      continue;
1675    }
1676
1677    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1678                                          AMDGPUAS::CONSTANT_BUFFER_0);
1679
1680    // i64 isn't a legal type, so the register type used ends up as i32, which
1681    // isn't expected here. It attempts to create this sextload, but it ends up
1682    // being invalid. Somehow this seems to work with i64 arguments, but breaks
1683    // for <1 x i64>.
1684
1685    // The first 36 bytes of the input buffer contains information about
1686    // thread group and global sizes.
1687    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1688    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1689      // FIXME: This should really check the extload type, but the handling of
1690      // extload vector parameters seems to be broken.
1691
1692      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1693      Ext = ISD::SEXTLOAD;
1694    }
1695
1696    // Compute the offset from the value.
1697    // XXX - I think PartOffset should give you this, but it seems to give the
1698    // size of the register which isn't useful.
1699
1700    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1701    unsigned PartOffset = VA.getLocMemOffset();
1702    unsigned Offset = 36 + VA.getLocMemOffset();
1703
1704    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1705    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1706                              DAG.getConstant(Offset, MVT::i32),
1707                              DAG.getUNDEF(MVT::i32),
1708                              PtrInfo,
1709                              MemVT, false, true, true, 4);
1710
1711    // 4 is the preferred alignment for the CONSTANT memory space.
1712    InVals.push_back(Arg);
1713    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1714  }
1715  return Chain;
1716}
1717
1718EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1719   if (!VT.isVector())
1720     return MVT::i32;
1721   return VT.changeVectorElementTypeToInteger();
1722}
1723
1724static SDValue CompactSwizzlableVector(
1725  SelectionDAG &DAG, SDValue VectorEntry,
1726  DenseMap<unsigned, unsigned> &RemapSwizzle) {
1727  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1728  assert(RemapSwizzle.empty());
1729  SDValue NewBldVec[4] = {
1730    VectorEntry.getOperand(0),
1731    VectorEntry.getOperand(1),
1732    VectorEntry.getOperand(2),
1733    VectorEntry.getOperand(3)
1734  };
1735
1736  for (unsigned i = 0; i < 4; i++) {
1737    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1738      // We mask write here to teach later passes that the ith element of this
1739      // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1740      // break false dependencies and additionnaly make assembly easier to read.
1741      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1742    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1743      if (C->isZero()) {
1744        RemapSwizzle[i] = 4; // SEL_0
1745        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1746      } else if (C->isExactlyValue(1.0)) {
1747        RemapSwizzle[i] = 5; // SEL_1
1748        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1749      }
1750    }
1751
1752    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1753      continue;
1754    for (unsigned j = 0; j < i; j++) {
1755      if (NewBldVec[i] == NewBldVec[j]) {
1756        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1757        RemapSwizzle[i] = j;
1758        break;
1759      }
1760    }
1761  }
1762
1763  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1764                     VectorEntry.getValueType(), NewBldVec);
1765}
1766
1767static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1768                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
1769  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1770  assert(RemapSwizzle.empty());
1771  SDValue NewBldVec[4] = {
1772      VectorEntry.getOperand(0),
1773      VectorEntry.getOperand(1),
1774      VectorEntry.getOperand(2),
1775      VectorEntry.getOperand(3)
1776  };
1777  bool isUnmovable[4] = { false, false, false, false };
1778  for (unsigned i = 0; i < 4; i++) {
1779    RemapSwizzle[i] = i;
1780    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1781      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1782          ->getZExtValue();
1783      if (i == Idx)
1784        isUnmovable[Idx] = true;
1785    }
1786  }
1787
1788  for (unsigned i = 0; i < 4; i++) {
1789    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1790      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1791          ->getZExtValue();
1792      if (isUnmovable[Idx])
1793        continue;
1794      // Swap i and Idx
1795      std::swap(NewBldVec[Idx], NewBldVec[i]);
1796      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1797      break;
1798    }
1799  }
1800
1801  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1802                     VectorEntry.getValueType(), NewBldVec);
1803}
1804
1805
1806SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1807SDValue Swz[4], SelectionDAG &DAG) const {
1808  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1809  // Old -> New swizzle values
1810  DenseMap<unsigned, unsigned> SwizzleRemap;
1811
1812  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1813  for (unsigned i = 0; i < 4; i++) {
1814    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1815    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1816      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1817  }
1818
1819  SwizzleRemap.clear();
1820  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1821  for (unsigned i = 0; i < 4; i++) {
1822    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1823    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1824      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1825  }
1826
1827  return BuildVector;
1828}
1829
1830
1831//===----------------------------------------------------------------------===//
1832// Custom DAG Optimizations
1833//===----------------------------------------------------------------------===//
1834
1835SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1836                                              DAGCombinerInfo &DCI) const {
1837  SelectionDAG &DAG = DCI.DAG;
1838
1839  switch (N->getOpcode()) {
1840  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1841  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1842  case ISD::FP_ROUND: {
1843      SDValue Arg = N->getOperand(0);
1844      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1845        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1846                           Arg.getOperand(0));
1847      }
1848      break;
1849    }
1850
1851  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1852  // (i32 select_cc f32, f32, -1, 0 cc)
1853  //
1854  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1855  // this to one of the SET*_DX10 instructions.
1856  case ISD::FP_TO_SINT: {
1857    SDValue FNeg = N->getOperand(0);
1858    if (FNeg.getOpcode() != ISD::FNEG) {
1859      return SDValue();
1860    }
1861    SDValue SelectCC = FNeg.getOperand(0);
1862    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1863        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1864        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1865        !isHWTrueValue(SelectCC.getOperand(2)) ||
1866        !isHWFalseValue(SelectCC.getOperand(3))) {
1867      return SDValue();
1868    }
1869
1870    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1871                           SelectCC.getOperand(0), // LHS
1872                           SelectCC.getOperand(1), // RHS
1873                           DAG.getConstant(-1, MVT::i32), // True
1874                           DAG.getConstant(0, MVT::i32),  // False
1875                           SelectCC.getOperand(4)); // CC
1876
1877    break;
1878  }
1879
1880  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1881  // => build_vector elt0, ... , NewEltIdx, ... , eltN
1882  case ISD::INSERT_VECTOR_ELT: {
1883    SDValue InVec = N->getOperand(0);
1884    SDValue InVal = N->getOperand(1);
1885    SDValue EltNo = N->getOperand(2);
1886    SDLoc dl(N);
1887
1888    // If the inserted element is an UNDEF, just use the input vector.
1889    if (InVal.getOpcode() == ISD::UNDEF)
1890      return InVec;
1891
1892    EVT VT = InVec.getValueType();
1893
1894    // If we can't generate a legal BUILD_VECTOR, exit
1895    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1896      return SDValue();
1897
1898    // Check that we know which element is being inserted
1899    if (!isa<ConstantSDNode>(EltNo))
1900      return SDValue();
1901    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1902
1903    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1904    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1905    // vector elements.
1906    SmallVector<SDValue, 8> Ops;
1907    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1908      Ops.append(InVec.getNode()->op_begin(),
1909                 InVec.getNode()->op_end());
1910    } else if (InVec.getOpcode() == ISD::UNDEF) {
1911      unsigned NElts = VT.getVectorNumElements();
1912      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1913    } else {
1914      return SDValue();
1915    }
1916
1917    // Insert the element
1918    if (Elt < Ops.size()) {
1919      // All the operands of BUILD_VECTOR must have the same type;
1920      // we enforce that here.
1921      EVT OpVT = Ops[0].getValueType();
1922      if (InVal.getValueType() != OpVT)
1923        InVal = OpVT.bitsGT(InVal.getValueType()) ?
1924          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1925          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1926      Ops[Elt] = InVal;
1927    }
1928
1929    // Return the new vector
1930    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1931  }
1932
1933  // Extract_vec (Build_vector) generated by custom lowering
1934  // also needs to be customly combined
1935  case ISD::EXTRACT_VECTOR_ELT: {
1936    SDValue Arg = N->getOperand(0);
1937    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1938      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1939        unsigned Element = Const->getZExtValue();
1940        return Arg->getOperand(Element);
1941      }
1942    }
1943    if (Arg.getOpcode() == ISD::BITCAST &&
1944        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1945      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1946        unsigned Element = Const->getZExtValue();
1947        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1948            Arg->getOperand(0).getOperand(Element));
1949      }
1950    }
1951  }
1952
1953  case ISD::SELECT_CC: {
1954    // Try common optimizations
1955    SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1956    if (Ret.getNode())
1957      return Ret;
1958
1959    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1960    //      selectcc x, y, a, b, inv(cc)
1961    //
1962    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1963    //      selectcc x, y, a, b, cc
1964    SDValue LHS = N->getOperand(0);
1965    if (LHS.getOpcode() != ISD::SELECT_CC) {
1966      return SDValue();
1967    }
1968
1969    SDValue RHS = N->getOperand(1);
1970    SDValue True = N->getOperand(2);
1971    SDValue False = N->getOperand(3);
1972    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1973
1974    if (LHS.getOperand(2).getNode() != True.getNode() ||
1975        LHS.getOperand(3).getNode() != False.getNode() ||
1976        RHS.getNode() != False.getNode()) {
1977      return SDValue();
1978    }
1979
1980    switch (NCC) {
1981    default: return SDValue();
1982    case ISD::SETNE: return LHS;
1983    case ISD::SETEQ: {
1984      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1985      LHSCC = ISD::getSetCCInverse(LHSCC,
1986                                  LHS.getOperand(0).getValueType().isInteger());
1987      if (DCI.isBeforeLegalizeOps() ||
1988          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1989        return DAG.getSelectCC(SDLoc(N),
1990                               LHS.getOperand(0),
1991                               LHS.getOperand(1),
1992                               LHS.getOperand(2),
1993                               LHS.getOperand(3),
1994                               LHSCC);
1995      break;
1996    }
1997    }
1998    return SDValue();
1999  }
2000
2001  case AMDGPUISD::EXPORT: {
2002    SDValue Arg = N->getOperand(1);
2003    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2004      break;
2005
2006    SDValue NewArgs[8] = {
2007      N->getOperand(0), // Chain
2008      SDValue(),
2009      N->getOperand(2), // ArrayBase
2010      N->getOperand(3), // Type
2011      N->getOperand(4), // SWZ_X
2012      N->getOperand(5), // SWZ_Y
2013      N->getOperand(6), // SWZ_Z
2014      N->getOperand(7) // SWZ_W
2015    };
2016    SDLoc DL(N);
2017    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2018    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2019  }
2020  case AMDGPUISD::TEXTURE_FETCH: {
2021    SDValue Arg = N->getOperand(1);
2022    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2023      break;
2024
2025    SDValue NewArgs[19] = {
2026      N->getOperand(0),
2027      N->getOperand(1),
2028      N->getOperand(2),
2029      N->getOperand(3),
2030      N->getOperand(4),
2031      N->getOperand(5),
2032      N->getOperand(6),
2033      N->getOperand(7),
2034      N->getOperand(8),
2035      N->getOperand(9),
2036      N->getOperand(10),
2037      N->getOperand(11),
2038      N->getOperand(12),
2039      N->getOperand(13),
2040      N->getOperand(14),
2041      N->getOperand(15),
2042      N->getOperand(16),
2043      N->getOperand(17),
2044      N->getOperand(18),
2045    };
2046    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2047    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2048        NewArgs);
2049  }
2050  }
2051
2052  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2053}
2054
2055static bool
2056FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2057            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2058  const R600InstrInfo *TII =
2059      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2060  if (!Src.isMachineOpcode())
2061    return false;
2062  switch (Src.getMachineOpcode()) {
2063  case AMDGPU::FNEG_R600:
2064    if (!Neg.getNode())
2065      return false;
2066    Src = Src.getOperand(0);
2067    Neg = DAG.getTargetConstant(1, MVT::i32);
2068    return true;
2069  case AMDGPU::FABS_R600:
2070    if (!Abs.getNode())
2071      return false;
2072    Src = Src.getOperand(0);
2073    Abs = DAG.getTargetConstant(1, MVT::i32);
2074    return true;
2075  case AMDGPU::CONST_COPY: {
2076    unsigned Opcode = ParentNode->getMachineOpcode();
2077    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2078
2079    if (!Sel.getNode())
2080      return false;
2081
2082    SDValue CstOffset = Src.getOperand(0);
2083    if (ParentNode->getValueType(0).isVector())
2084      return false;
2085
2086    // Gather constants values
2087    int SrcIndices[] = {
2088      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2089      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2090      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2091      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2092      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2093      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2094      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2095      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2096      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2097      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2098      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2099    };
2100    std::vector<unsigned> Consts;
2101    for (int OtherSrcIdx : SrcIndices) {
2102      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2103      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2104        continue;
2105      if (HasDst) {
2106        OtherSrcIdx--;
2107        OtherSelIdx--;
2108      }
2109      if (RegisterSDNode *Reg =
2110          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2111        if (Reg->getReg() == AMDGPU::ALU_CONST) {
2112          ConstantSDNode *Cst
2113            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2114          Consts.push_back(Cst->getZExtValue());
2115        }
2116      }
2117    }
2118
2119    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2120    Consts.push_back(Cst->getZExtValue());
2121    if (!TII->fitsConstReadLimitations(Consts)) {
2122      return false;
2123    }
2124
2125    Sel = CstOffset;
2126    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2127    return true;
2128  }
2129  case AMDGPU::MOV_IMM_I32:
2130  case AMDGPU::MOV_IMM_F32: {
2131    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2132    uint64_t ImmValue = 0;
2133
2134
2135    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2136      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2137      float FloatValue = FPC->getValueAPF().convertToFloat();
2138      if (FloatValue == 0.0) {
2139        ImmReg = AMDGPU::ZERO;
2140      } else if (FloatValue == 0.5) {
2141        ImmReg = AMDGPU::HALF;
2142      } else if (FloatValue == 1.0) {
2143        ImmReg = AMDGPU::ONE;
2144      } else {
2145        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2146      }
2147    } else {
2148      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2149      uint64_t Value = C->getZExtValue();
2150      if (Value == 0) {
2151        ImmReg = AMDGPU::ZERO;
2152      } else if (Value == 1) {
2153        ImmReg = AMDGPU::ONE_INT;
2154      } else {
2155        ImmValue = Value;
2156      }
2157    }
2158
2159    // Check that we aren't already using an immediate.
2160    // XXX: It's possible for an instruction to have more than one
2161    // immediate operand, but this is not supported yet.
2162    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2163      if (!Imm.getNode())
2164        return false;
2165      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2166      assert(C);
2167      if (C->getZExtValue())
2168        return false;
2169      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2170    }
2171    Src = DAG.getRegister(ImmReg, MVT::i32);
2172    return true;
2173  }
2174  default:
2175    return false;
2176  }
2177}
2178
2179
2180/// \brief Fold the instructions after selecting them
2181SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2182                                            SelectionDAG &DAG) const {
2183  const R600InstrInfo *TII =
2184      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2185  if (!Node->isMachineOpcode())
2186    return Node;
2187  unsigned Opcode = Node->getMachineOpcode();
2188  SDValue FakeOp;
2189
2190  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2191
2192  if (Opcode == AMDGPU::DOT_4) {
2193    int OperandIdx[] = {
2194      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2195      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2196      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2197      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2198      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2199      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2200      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2201      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2202        };
2203    int NegIdx[] = {
2204      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2205      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2206      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2207      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2208      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2209      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2210      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2211      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2212    };
2213    int AbsIdx[] = {
2214      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2215      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2216      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2217      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2218      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2219      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2220      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2221      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2222    };
2223    for (unsigned i = 0; i < 8; i++) {
2224      if (OperandIdx[i] < 0)
2225        return Node;
2226      SDValue &Src = Ops[OperandIdx[i] - 1];
2227      SDValue &Neg = Ops[NegIdx[i] - 1];
2228      SDValue &Abs = Ops[AbsIdx[i] - 1];
2229      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2230      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2231      if (HasDst)
2232        SelIdx--;
2233      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2234      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2235        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2236    }
2237  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2238    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2239      SDValue &Src = Ops[i];
2240      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2241        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2242    }
2243  } else if (Opcode == AMDGPU::CLAMP_R600) {
2244    SDValue Src = Node->getOperand(0);
2245    if (!Src.isMachineOpcode() ||
2246        !TII->hasInstrModifiers(Src.getMachineOpcode()))
2247      return Node;
2248    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2249        AMDGPU::OpName::clamp);
2250    if (ClampIdx < 0)
2251      return Node;
2252    std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2253    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2254    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2255        Node->getVTList(), Ops);
2256  } else {
2257    if (!TII->hasInstrModifiers(Opcode))
2258      return Node;
2259    int OperandIdx[] = {
2260      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2261      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2262      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2263    };
2264    int NegIdx[] = {
2265      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2266      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2267      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2268    };
2269    int AbsIdx[] = {
2270      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2271      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2272      -1
2273    };
2274    for (unsigned i = 0; i < 3; i++) {
2275      if (OperandIdx[i] < 0)
2276        return Node;
2277      SDValue &Src = Ops[OperandIdx[i] - 1];
2278      SDValue &Neg = Ops[NegIdx[i] - 1];
2279      SDValue FakeAbs;
2280      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2281      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2282      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2283      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2284      if (HasDst) {
2285        SelIdx--;
2286        ImmIdx--;
2287      }
2288      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2289      SDValue &Imm = Ops[ImmIdx];
2290      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2291        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2292    }
2293  }
2294
2295  return Node;
2296}
2297