//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  // Set condition code actions
  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // Expand sign extension of vectors
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::SUB, MVT::i64, Expand);
  // These should be replaced by UDIVREM, but it does not happen automatically
  // during Type Legalization
  setOperationAction(ISD::UDIV, MVT::i64, Custom);
  setOperationAction(ISD::UREM, MVT::i64, Custom);
  setOperationAction(ISD::SDIV, MVT::i64, Custom);
  setOperationAction(ISD::SREM, MVT::i64, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::Source);
}

MachineBasicBlock *R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr *MI, MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
    if (TII->isLDSRetInstr(MI->getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   AMDGPU::MOV,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr *defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

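    // Remap the source swizzle and coordinate-type flags for the texture
    // target: Rect targets use unnormalized X/Y, shadow targets take the
    // compare value from Z in the W slot, and array targets keep the layer
    // index in Z unnormalized.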
    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }


  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
              .addOperand(MI->getOperand(0));
      break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
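      // A negative i/j buffer index selects the constant (flat) interpolation
      // path via INTERP_VEC_LOAD; otherwise the barycentric I/J pair is
      // expected as live-in registers 2*ijb and 2*ijb+1 and is fed to
      // INTERP_PAIR_XY/ZW.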
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_interp_xy:
    case AMDGPUIntrinsic::R600_interp_zw: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      MachineSDNode *interp;
      SDValue RegisterINode = Op.getOperand(2);
      SDValue RegisterJNode = Op.getOperand(3);

      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
          SDValue(interp, 0), SDValue(interp, 1));
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy:
    case AMDGPUIntrinsic::R600_ldptr: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      case AMDGPUIntrinsic::R600_ldptr:
        TextureOp = 10;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(3, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    case Intrinsic::AMDGPU_rsq:
      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::UDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM);
    break;
  }
  case ISD::UREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM.getValue(1));
    break;
  }
  case ISD::SDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM);
    break;
  }
  case ISD::SREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM.getValue(1));
    break;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    SDValue one = DAG.getConstant(1, HalfVT);
    SDValue zero = DAG.getConstant(0, HalfVT);

    // Hi/Lo split
    SDValue LHS = N->getOperand(0);
    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
    SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);

    SDValue RHS = N->getOperand(1);
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
    SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);

    // Get Speculative values
    SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
    SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

    SDValue REM_Hi = zero;
    SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);

    SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
    SDValue DIV_Lo = zero;

    const unsigned halfBitWidth = HalfVT.getSizeInBits();

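    // Restoring long division: each iteration shifts the next bit of LHS_Lo
    // into the running remainder, and when the remainder reaches RHS we
    // subtract RHS and set the corresponding bit of the quotient.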
    for (unsigned i = 0; i < halfBitWidth; ++i) {
      SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
      // Get Value of high bit
      SDValue HBit;
      if (halfBitWidth == 32 && Subtarget->hasBFE()) {
        HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
      } else {
        HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
        HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
      }

      SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
        DAG.getConstant(halfBitWidth - 1, HalfVT));
      REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
      REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);

      REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
      REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);

      SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);

      SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
      SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);

      DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

      // Update REM
      SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);

      REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
      REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
      REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
    }

    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
    Results.push_back(DIV);
    Results.push_back(REM);
    break;
  }
  }
}

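// Turn \p Vector into an AMDGPUISD::BUILD_VERTICAL_VECTOR, i.e. a vector whose
// elements are kept in the same channel of consecutive registers. This is the
// form used by the EXTRACT/INSERT_VECTOR_ELT lowerings below when the element
// index is not a compile-time constant.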
SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {
  SDLoc DL(Vector);
  EVT VecVT = Vector.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  SmallVector<SDValue, 8> Args;

  for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                               Vector, DAG.getConstant(i, getVectorIdxTy())));
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
}

SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Index = Op.getOperand(1);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vector, Index);
}

SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
                               Vector, Value, Index);
  return vectorToVerticalVector(DAG, Insert);
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
          DAG.getConstantFP(0.15915494309, MVT::f32)),
        DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
        DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}

SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One  = DAG.getConstant(1, VT);

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the 0 special case. Without it
  // the CompShift might be 32, producing incorrect results in Overflow. So we
  // do the shift in two steps; the alternative is to add a conditional to
  // filter out the special case.

  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);

  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
  SDValue LoBig = Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One  = DAG.getConstant(1, VT);

  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the 0 special case. Without it
  // the CompShift might be 32, producing incorrect results in Overflow. So we
  // do the shift in two steps; the alternative is to add a conditional to
  // filter out the special case.

  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType *PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue,
                             HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
/// convert these pointers to a register index.  Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

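// Compute the register channel and pointer increment needed to address element
// \p ElemIdx of a private-memory value that is spread across registers
// \p StackWidth channels wide.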
void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
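      // Global i8/i16 truncating stores are emulated with a masked dword
      // write: the truncated value and a byte/halfword mask are shifted to the
      // right byte offset and emitted as a STORE_MSKOR memory intrinsic.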
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode()) {
    return Ret;
  }
  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in store");
1431
1432    for (unsigned i = 0; i < NumElemVT; ++i) {
1433      unsigned Channel, PtrIncr;
1434      getStackAddress(StackWidth, i, Channel, PtrIncr);
1435      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1436                        DAG.getConstant(PtrIncr, MVT::i32));
1437      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1438                                 Value, DAG.getConstant(i, MVT::i32));
1439
1440      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1441                              Chain, Elem, Ptr,
1442                              DAG.getTargetConstant(Channel, MVT::i32));
1443    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain,
                        Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
1451  }
1452
1453  return Chain;
1454}
1455
// Return 512 + (kc_bank << 12), or -1 if AddressSpace is not a constant
// buffer.
1457static int
1458ConstantAddressBlock(unsigned AddressSpace) {
1459  switch (AddressSpace) {
1460  case AMDGPUAS::CONSTANT_BUFFER_0:
1461    return 512;
1462  case AMDGPUAS::CONSTANT_BUFFER_1:
1463    return 512 + 4096;
1464  case AMDGPUAS::CONSTANT_BUFFER_2:
1465    return 512 + 4096 * 2;
1466  case AMDGPUAS::CONSTANT_BUFFER_3:
1467    return 512 + 4096 * 3;
1468  case AMDGPUAS::CONSTANT_BUFFER_4:
1469    return 512 + 4096 * 4;
1470  case AMDGPUAS::CONSTANT_BUFFER_5:
1471    return 512 + 4096 * 5;
1472  case AMDGPUAS::CONSTANT_BUFFER_6:
1473    return 512 + 4096 * 6;
1474  case AMDGPUAS::CONSTANT_BUFFER_7:
1475    return 512 + 4096 * 7;
1476  case AMDGPUAS::CONSTANT_BUFFER_8:
1477    return 512 + 4096 * 8;
1478  case AMDGPUAS::CONSTANT_BUFFER_9:
1479    return 512 + 4096 * 9;
1480  case AMDGPUAS::CONSTANT_BUFFER_10:
1481    return 512 + 4096 * 10;
1482  case AMDGPUAS::CONSTANT_BUFFER_11:
1483    return 512 + 4096 * 11;
1484  case AMDGPUAS::CONSTANT_BUFFER_12:
1485    return 512 + 4096 * 12;
1486  case AMDGPUAS::CONSTANT_BUFFER_13:
1487    return 512 + 4096 * 13;
1488  case AMDGPUAS::CONSTANT_BUFFER_14:
1489    return 512 + 4096 * 14;
1490  case AMDGPUAS::CONSTANT_BUFFER_15:
1491    return 512 + 4096 * 15;
1492  default:
1493    return -1;
1494  }
1495}
1496
1497SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1498{
1499  EVT VT = Op.getValueType();
1500  SDLoc DL(Op);
1501  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1502  SDValue Chain = Op.getOperand(0);
1503  SDValue Ptr = Op.getOperand(1);
1504  SDValue LoweredLoad;
1505
1506  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1507  if (Ret.getNode()) {
1508    SDValue Ops[2] = {
1509      Ret,
1510      Chain
1511    };
1512    return DAG.getMergeValues(Ops, DL);
1513  }
1514
1515
1516  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1517    SDValue MergedValues[2] = {
1518      SplitVectorLoad(Op, DAG),
1519      Chain
1520    };
1521    return DAG.getMergeValues(MergedValues, DL);
1522  }
1523
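  // Loads from the constant buffers are lowered to CONST_ADDRESS nodes. When
  // the pointer is known at compile time, each channel of the 128-bit slot is
  // fetched with its own CONST_ADDRESS node; otherwise the whole v4i32 slot
  // is loaded.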
1524  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1525  if (ConstantBlock > -1 &&
1526      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1527       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1528    SDValue Result;
1529    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1530        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1531        isa<ConstantSDNode>(Ptr)) {
1532      SDValue Slots[4];
1533      for (unsigned i = 0; i < 4; i++) {
        // We want the constant position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // where const_index is Ptr, which LLVM computes with an alignment of
        // 16. Thus we add (512 + (kc_bank << 12)) * 16 + chan * 4 here and
        // then divide by 4 at the ISel step.
1539        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1540            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1541        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1542      }
1543      EVT NewVT = MVT::v4i32;
1544      unsigned NumElements = 4;
1545      if (VT.isVector()) {
1546        NewVT = VT;
1547        NumElements = VT.getVectorNumElements();
1548      }
1549      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1550                           makeArrayRef(Slots, NumElements));
1551    } else {
      // A non-constant pointer can't be folded; keep it as a v4i32
      // CONST_ADDRESS load.
1553      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1554          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1555          DAG.getConstant(LoadNode->getAddressSpace() -
1556                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1557          );
1558    }
1559
1560    if (!VT.isVector()) {
1561      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1562          DAG.getConstant(0, MVT::i32));
1563    }
1564
1565    SDValue MergedValues[2] = {
1566      Result,
1567      Chain
1568    };
1569    return DAG.getMergeValues(MergedValues, DL);
1570  }
1571
1572  // For most operations returning SDValue() will result in the node being
1573  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1574  // need to manually expand loads that may be legal in some address spaces and
1575  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1576  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However, SEXT loads from other address spaces are not supported,
  // so we need to expand them here.
1579  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1580    EVT MemVT = LoadNode->getMemoryVT();
1581    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1582    SDValue ShiftAmount =
1583          DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1584    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1585                                  LoadNode->getPointerInfo(), MemVT,
1586                                  LoadNode->isVolatile(),
1587                                  LoadNode->isNonTemporal(),
1588                                  LoadNode->getAlignment());
1589    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1590    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1591
1592    SDValue MergedValues[2] = { Sra, Chain };
1593    return DAG.getMergeValues(MergedValues, DL);
1594  }
1595
1596  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1597    return SDValue();
1598  }
1599
1600  // Lowering for indirect addressing
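  // As with private stores above, the pointer is converted to a register
  // index and each element is read back with a REGISTER_LOAD on its stack
  // channel.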
1601  const MachineFunction &MF = DAG.getMachineFunction();
1602  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1603                                         getTargetMachine().getFrameLowering());
1604  unsigned StackWidth = TFL->getStackWidth(MF);
1605
1606  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1607
1608  if (VT.isVector()) {
1609    unsigned NumElemVT = VT.getVectorNumElements();
1610    EVT ElemVT = VT.getVectorElementType();
1611    SDValue Loads[4];
1612
1613    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1614                                      "vector width in load");
1615
1616    for (unsigned i = 0; i < NumElemVT; ++i) {
1617      unsigned Channel, PtrIncr;
1618      getStackAddress(StackWidth, i, Channel, PtrIncr);
1619      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1620                        DAG.getConstant(PtrIncr, MVT::i32));
1621      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1622                             Chain, Ptr,
1623                             DAG.getTargetConstant(Channel, MVT::i32),
1624                             Op.getOperand(2));
1625    }
1626    for (unsigned i = NumElemVT; i < 4; ++i) {
1627      Loads[i] = DAG.getUNDEF(ElemVT);
1628    }
1629    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1630    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1631  } else {
1632    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1633                              Chain, Ptr,
1634                              DAG.getTargetConstant(0, MVT::i32), // Channel
1635                              Op.getOperand(2));
1636  }
1637
1638  SDValue Ops[2] = {
1639    LoweredLoad,
1640    Chain
1641  };
1642
1643  return DAG.getMergeValues(Ops, DL);
1644}
1645
1646SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1647  SDValue Chain = Op.getOperand(0);
1648  SDValue Cond  = Op.getOperand(1);
1649  SDValue Jump  = Op.getOperand(2);
1650
1651  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1652                     Chain, Jump, Cond);
1653}
1654
1655/// XXX Only kernel functions are supported, so we can assume for now that
1656/// every function is a kernel function, but in the future we should use
1657/// separate calling conventions for kernel and non-kernel functions.
1658SDValue R600TargetLowering::LowerFormalArguments(
1659                                      SDValue Chain,
1660                                      CallingConv::ID CallConv,
1661                                      bool isVarArg,
1662                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1663                                      SDLoc DL, SelectionDAG &DAG,
1664                                      SmallVectorImpl<SDValue> &InVals) const {
1665  SmallVector<CCValAssign, 16> ArgLocs;
1666  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1667                 getTargetMachine(), ArgLocs, *DAG.getContext());
1668  MachineFunction &MF = DAG.getMachineFunction();
1669  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
1670
1671  SmallVector<ISD::InputArg, 8> LocalIns;
1672
1673  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1674
1675  AnalyzeFormalArguments(CCInfo, LocalIns);
1676
1677  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1678    CCValAssign &VA = ArgLocs[i];
1679    EVT VT = Ins[i].VT;
1680    EVT MemVT = LocalIns[i].VT;
1681
1682    if (ShaderType != ShaderType::COMPUTE) {
1683      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1684      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1685      InVals.push_back(Register);
1686      continue;
1687    }
1688
1689    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1690                                                   AMDGPUAS::CONSTANT_BUFFER_0);
1691
    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. The sextload created below then ends up being
    // invalid. Somehow this seems to work for plain i64 arguments, but it
    // breaks for <1 x i64>.
1696
    // The first 36 bytes of the input buffer contain information about the
    // thread group and global sizes.
1699
    // FIXME: This should really check the extload type, but the handling of
    // extload vector parameters seems to be broken.
1702    //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1703    ISD::LoadExtType Ext = ISD::SEXTLOAD;
1704    SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
1705                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1706                                 MachinePointerInfo(UndefValue::get(PtrTy)),
1707                                 MemVT, false, false, 4);
1708
1709    // 4 is the preferred alignment for the CONSTANT memory space.
1710    InVals.push_back(Arg);
1711  }
1712  return Chain;
1713}
1714
EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}
1720
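/// \brief Replace constant and duplicated BUILD_VECTOR elements with swizzle
/// selects: 0.0 and 1.0 become SEL_0 / SEL_1, undef elements become
/// SEL_MASK_WRITE, and repeated elements are remapped onto their first
/// occurrence. RemapSwizzle records the resulting old -> new swizzle values.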
1721static SDValue CompactSwizzlableVector(
1722  SelectionDAG &DAG, SDValue VectorEntry,
1723  DenseMap<unsigned, unsigned> &RemapSwizzle) {
1724  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1725  assert(RemapSwizzle.empty());
1726  SDValue NewBldVec[4] = {
1727    VectorEntry.getOperand(0),
1728    VectorEntry.getOperand(1),
1729    VectorEntry.getOperand(2),
1730    VectorEntry.getOperand(3)
1731  };
1732
1733  for (unsigned i = 0; i < 4; i++) {
1734    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      // We mask the write here to teach later passes that the ith element of
      // this vector is undef. Thus we can use it to reduce 128-bit register
      // usage, break false dependencies and additionally make assembly easier
      // to read.
1738      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1739    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1740      if (C->isZero()) {
1741        RemapSwizzle[i] = 4; // SEL_0
1742        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1743      } else if (C->isExactlyValue(1.0)) {
1744        RemapSwizzle[i] = 5; // SEL_1
1745        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1746      }
1747    }
1748
1749    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1750      continue;
1751    for (unsigned j = 0; j < i; j++) {
1752      if (NewBldVec[i] == NewBldVec[j]) {
1753        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1754        RemapSwizzle[i] = j;
1755        break;
1756      }
1757    }
1758  }
1759
1760  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1761                     VectorEntry.getValueType(), NewBldVec);
1762}
1763
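/// \brief Swap an EXTRACT_VECTOR_ELT element of the BUILD_VECTOR onto the
/// channel it is extracted from when that channel is not already pinned,
/// bringing the final swizzle closer to the identity mapping. RemapSwizzle
/// records the old -> new swizzle values.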
1764static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1765                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
1766  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1767  assert(RemapSwizzle.empty());
1768  SDValue NewBldVec[4] = {
1769      VectorEntry.getOperand(0),
1770      VectorEntry.getOperand(1),
1771      VectorEntry.getOperand(2),
1772      VectorEntry.getOperand(3)
1773  };
1774  bool isUnmovable[4] = { false, false, false, false };
1775  for (unsigned i = 0; i < 4; i++) {
1776    RemapSwizzle[i] = i;
1777    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1779          ->getZExtValue();
1780      if (i == Idx)
1781        isUnmovable[Idx] = true;
1782    }
1783  }
1784
1785  for (unsigned i = 0; i < 4; i++) {
1786    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1788          ->getZExtValue();
1789      if (isUnmovable[Idx])
1790        continue;
1791      // Swap i and Idx
1792      std::swap(NewBldVec[Idx], NewBldVec[i]);
1793      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1794      break;
1795    }
1796  }
1797
1798  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1799                     VectorEntry.getValueType(), NewBldVec);
1800}
1801
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
                                            SelectionDAG &DAG) const {
1805  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1806  // Old -> New swizzle values
1807  DenseMap<unsigned, unsigned> SwizzleRemap;
1808
1809  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1810  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1812    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1813      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1814  }
1815
1816  SwizzleRemap.clear();
1817  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1818  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1820    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1821      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1822  }
1823
1824  return BuildVector;
1825}
1826
1827
1828//===----------------------------------------------------------------------===//
1829// Custom DAG Optimizations
1830//===----------------------------------------------------------------------===//
1831
1832SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1833                                              DAGCombinerInfo &DCI) const {
1834  SelectionDAG &DAG = DCI.DAG;
1835
1836  switch (N->getOpcode()) {
1837  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1838  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1839  case ISD::FP_ROUND: {
1840      SDValue Arg = N->getOperand(0);
1841      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1842        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1843                           Arg.getOperand(0));
1844      }
1845      break;
1846    }
1847
  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0, cc))) ->
  // (i32 select_cc f32, f32, -1, 0, cc)
1850  //
1851  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1852  // this to one of the SET*_DX10 instructions.
1853  case ISD::FP_TO_SINT: {
1854    SDValue FNeg = N->getOperand(0);
1855    if (FNeg.getOpcode() != ISD::FNEG) {
1856      return SDValue();
1857    }
1858    SDValue SelectCC = FNeg.getOperand(0);
1859    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1860        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1861        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1862        !isHWTrueValue(SelectCC.getOperand(2)) ||
1863        !isHWFalseValue(SelectCC.getOperand(3))) {
1864      return SDValue();
1865    }
1866
    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC
1875  }
1876
1877  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1878  // => build_vector elt0, ... , NewEltIdx, ... , eltN
1879  case ISD::INSERT_VECTOR_ELT: {
1880    SDValue InVec = N->getOperand(0);
1881    SDValue InVal = N->getOperand(1);
1882    SDValue EltNo = N->getOperand(2);
1883    SDLoc dl(N);
1884
1885    // If the inserted element is an UNDEF, just use the input vector.
1886    if (InVal.getOpcode() == ISD::UNDEF)
1887      return InVec;
1888
1889    EVT VT = InVec.getValueType();
1890
1891    // If we can't generate a legal BUILD_VECTOR, exit
1892    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1893      return SDValue();
1894
1895    // Check that we know which element is being inserted
1896    if (!isa<ConstantSDNode>(EltNo))
1897      return SDValue();
1898    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1899
1900    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1901    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1902    // vector elements.
1903    SmallVector<SDValue, 8> Ops;
1904    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1905      Ops.append(InVec.getNode()->op_begin(),
1906                 InVec.getNode()->op_end());
1907    } else if (InVec.getOpcode() == ISD::UNDEF) {
1908      unsigned NElts = VT.getVectorNumElements();
1909      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1910    } else {
1911      return SDValue();
1912    }
1913
1914    // Insert the element
1915    if (Elt < Ops.size()) {
1916      // All the operands of BUILD_VECTOR must have the same type;
1917      // we enforce that here.
1918      EVT OpVT = Ops[0].getValueType();
1919      if (InVal.getValueType() != OpVT)
1920        InVal = OpVT.bitsGT(InVal.getValueType()) ?
1921          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1922          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1923      Ops[Elt] = InVal;
1924    }
1925
1926    // Return the new vector
1927    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1928  }
1929
  // An extract_vector_elt of a build_vector produced by custom lowering also
  // needs to be custom-combined.
1932  case ISD::EXTRACT_VECTOR_ELT: {
1933    SDValue Arg = N->getOperand(0);
1934    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1935      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1936        unsigned Element = Const->getZExtValue();
1937        return Arg->getOperand(Element);
1938      }
1939    }
1940    if (Arg.getOpcode() == ISD::BITCAST &&
1941        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1942      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1943        unsigned Element = Const->getZExtValue();
1944        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1945            Arg->getOperand(0).getOperand(Element));
1946      }
1947    }
    break;
  }
1949
1950  case ISD::SELECT_CC: {
1951    // Try common optimizations
1952    SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1953    if (Ret.getNode())
1954      return Ret;
1955
1956    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1957    //      selectcc x, y, a, b, inv(cc)
1958    //
1959    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1960    //      selectcc x, y, a, b, cc
1961    SDValue LHS = N->getOperand(0);
1962    if (LHS.getOpcode() != ISD::SELECT_CC) {
1963      return SDValue();
1964    }
1965
1966    SDValue RHS = N->getOperand(1);
1967    SDValue True = N->getOperand(2);
1968    SDValue False = N->getOperand(3);
1969    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1970
1971    if (LHS.getOperand(2).getNode() != True.getNode() ||
1972        LHS.getOperand(3).getNode() != False.getNode() ||
1973        RHS.getNode() != False.getNode()) {
1974      return SDValue();
1975    }
1976
1977    switch (NCC) {
1978    default: return SDValue();
1979    case ISD::SETNE: return LHS;
1980    case ISD::SETEQ: {
1981      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1982      LHSCC = ISD::getSetCCInverse(LHSCC,
1983                                  LHS.getOperand(0).getValueType().isInteger());
1984      if (DCI.isBeforeLegalizeOps() ||
1985          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1986        return DAG.getSelectCC(SDLoc(N),
1987                               LHS.getOperand(0),
1988                               LHS.getOperand(1),
1989                               LHS.getOperand(2),
1990                               LHS.getOperand(3),
1991                               LHSCC);
1992      break;
1993    }
1994    }
1995    return SDValue();
1996  }
1997
1998  case AMDGPUISD::EXPORT: {
1999    SDValue Arg = N->getOperand(1);
2000    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2001      break;
2002
2003    SDValue NewArgs[8] = {
2004      N->getOperand(0), // Chain
      SDValue(),        // Filled in below with the optimized swizzle vector
2006      N->getOperand(2), // ArrayBase
2007      N->getOperand(3), // Type
2008      N->getOperand(4), // SWZ_X
2009      N->getOperand(5), // SWZ_Y
2010      N->getOperand(6), // SWZ_Z
2011      N->getOperand(7) // SWZ_W
2012    };
2013    SDLoc DL(N);
2014    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2015    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2016  }
2017  case AMDGPUISD::TEXTURE_FETCH: {
2018    SDValue Arg = N->getOperand(1);
2019    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2020      break;
2021
2022    SDValue NewArgs[19] = {
2023      N->getOperand(0),
2024      N->getOperand(1),
2025      N->getOperand(2),
2026      N->getOperand(3),
2027      N->getOperand(4),
2028      N->getOperand(5),
2029      N->getOperand(6),
2030      N->getOperand(7),
2031      N->getOperand(8),
2032      N->getOperand(9),
2033      N->getOperand(10),
2034      N->getOperand(11),
2035      N->getOperand(12),
2036      N->getOperand(13),
2037      N->getOperand(14),
2038      N->getOperand(15),
2039      N->getOperand(16),
2040      N->getOperand(17),
2041      N->getOperand(18),
2042    };
2043    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2044    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2045        NewArgs);
2046  }
2047  }
2048
2049  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2050}
2051
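/// \brief Try to fold the source operand \p Src of \p ParentNode:
/// FNEG_R600 / FABS_R600 fold into the neg / abs modifier operands,
/// CONST_COPY folds into an ALU_CONST register plus a sel value (subject to
/// the constant-read limitations of the instruction group), and MOV_IMM_*
/// folds into an inline constant register or the ALU_LITERAL_X literal slot.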
2052static bool
2053FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2054            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2055  const R600InstrInfo *TII =
2056      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
2057  if (!Src.isMachineOpcode())
2058    return false;
2059  switch (Src.getMachineOpcode()) {
2060  case AMDGPU::FNEG_R600:
2061    if (!Neg.getNode())
2062      return false;
2063    Src = Src.getOperand(0);
2064    Neg = DAG.getTargetConstant(1, MVT::i32);
2065    return true;
2066  case AMDGPU::FABS_R600:
2067    if (!Abs.getNode())
2068      return false;
2069    Src = Src.getOperand(0);
2070    Abs = DAG.getTargetConstant(1, MVT::i32);
2071    return true;
2072  case AMDGPU::CONST_COPY: {
2073    unsigned Opcode = ParentNode->getMachineOpcode();
2074    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2075
2076    if (!Sel.getNode())
2077      return false;
2078
2079    SDValue CstOffset = Src.getOperand(0);
2080    if (ParentNode->getValueType(0).isVector())
2081      return false;
2082
    // Gather the constant values already used by this instruction.
2084    int SrcIndices[] = {
2085      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2086      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2087      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2088      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2089      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2090      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2091      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2092      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2093      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2094      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2095      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2096    };
2097    std::vector<unsigned> Consts;
2098    for (int OtherSrcIdx : SrcIndices) {
2099      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2100      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2101        continue;
2102      if (HasDst) {
2103        OtherSrcIdx--;
2104        OtherSelIdx--;
2105      }
2106      if (RegisterSDNode *Reg =
2107          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2108        if (Reg->getReg() == AMDGPU::ALU_CONST) {
2109          ConstantSDNode *Cst
2110            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2111          Consts.push_back(Cst->getZExtValue());
2112        }
2113      }
2114    }
2115
2116    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2117    Consts.push_back(Cst->getZExtValue());
2118    if (!TII->fitsConstReadLimitations(Consts)) {
2119      return false;
2120    }
2121
2122    Sel = CstOffset;
2123    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2124    return true;
2125  }
2126  case AMDGPU::MOV_IMM_I32:
2127  case AMDGPU::MOV_IMM_F32: {
2128    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

2132    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
2134      float FloatValue = FPC->getValueAPF().convertToFloat();
2135      if (FloatValue == 0.0) {
2136        ImmReg = AMDGPU::ZERO;
2137      } else if (FloatValue == 0.5) {
2138        ImmReg = AMDGPU::HALF;
2139      } else if (FloatValue == 1.0) {
2140        ImmReg = AMDGPU::ONE;
2141      } else {
2142        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2143      }
2144    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
2146      uint64_t Value = C->getZExtValue();
2147      if (Value == 0) {
2148        ImmReg = AMDGPU::ZERO;
2149      } else if (Value == 1) {
2150        ImmReg = AMDGPU::ONE_INT;
2151      } else {
2152        ImmValue = Value;
2153      }
2154    }
2155
2156    // Check that we aren't already using an immediate.
2157    // XXX: It's possible for an instruction to have more than one
2158    // immediate operand, but this is not supported yet.
2159    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2160      if (!Imm.getNode())
2161        return false;
2162      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2163      assert(C);
2164      if (C->getZExtValue())
2165        return false;
2166      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2167    }
2168    Src = DAG.getRegister(ImmReg, MVT::i32);
2169    return true;
2170  }
2171  default:
2172    return false;
2173  }
2174}
2175
2176
2177/// \brief Fold the instructions after selecting them
2178SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2179                                            SelectionDAG &DAG) const {
2180  const R600InstrInfo *TII =
2181      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
2182  if (!Node->isMachineOpcode())
2183    return Node;
2184  unsigned Opcode = Node->getMachineOpcode();
2185  SDValue FakeOp;
2186
2187  std::vector<SDValue> Ops;
2188  for (const SDUse &I : Node->ops())
2189    Ops.push_back(I);
2190
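  // DOT_4 has eight per-channel source slots (src0_X..W and src1_X..W); try
  // to fold a modifier or constant into each slot and rebuild the node after
  // the first successful fold.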
2191  if (Opcode == AMDGPU::DOT_4) {
2192    int OperandIdx[] = {
2193      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2194      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2195      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2196      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2197      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2198      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2199      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2200      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
2202    int NegIdx[] = {
2203      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2204      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2205      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2206      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2207      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2208      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2209      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2210      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2211    };
2212    int AbsIdx[] = {
2213      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2214      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2215      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2216      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2217      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2218      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2219      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2220      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2221    };
2222    for (unsigned i = 0; i < 8; i++) {
2223      if (OperandIdx[i] < 0)
2224        return Node;
2225      SDValue &Src = Ops[OperandIdx[i] - 1];
2226      SDValue &Neg = Ops[NegIdx[i] - 1];
2227      SDValue &Abs = Ops[AbsIdx[i] - 1];
2228      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2229      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2230      if (HasDst)
2231        SelIdx--;
2232      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2233      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2234        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2235    }
2236  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2237    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2238      SDValue &Src = Ops[i];
2239      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2240        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2241    }
2242  } else if (Opcode == AMDGPU::CLAMP_R600) {
2243    SDValue Src = Node->getOperand(0);
2244    if (!Src.isMachineOpcode() ||
2245        !TII->hasInstrModifiers(Src.getMachineOpcode()))
2246      return Node;
2247    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2248        AMDGPU::OpName::clamp);
2249    if (ClampIdx < 0)
2250      return Node;
    std::vector<SDValue> ClampOps;
    unsigned NumOp = Src.getNumOperands();
    for (unsigned i = 0; i < NumOp; ++i)
      ClampOps.push_back(Src.getOperand(i));
    ClampOps[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
                              Node->getVTList(), ClampOps);
2258  } else {
2259    if (!TII->hasInstrModifiers(Opcode))
2260      return Node;
2261    int OperandIdx[] = {
2262      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2263      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2264      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2265    };
2266    int NegIdx[] = {
2267      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2268      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2269      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2270    };
2271    int AbsIdx[] = {
2272      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2273      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2274      -1
2275    };
2276    for (unsigned i = 0; i < 3; i++) {
2277      if (OperandIdx[i] < 0)
2278        return Node;
2279      SDValue &Src = Ops[OperandIdx[i] - 1];
2280      SDValue &Neg = Ops[NegIdx[i] - 1];
2281      SDValue FakeAbs;
2282      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2283      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2284      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2285      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2286      if (HasDst) {
2287        SelIdx--;
2288        ImmIdx--;
2289      }
2290      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2291      SDValue &Imm = Ops[ImmIdx];
2292      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2293        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2294    }
2295  }
2296
2297  return Node;
2298}
2299