R600ISelLowering.cpp revision c3c169c8844db7f8934fbb3a411290dc3cdcb543
//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  computeRegisterProperties();

  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);

  setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
  setOperationAction(ISD::AND,  MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  setOperationAction(ISD::FPOW, MVT::f32, Custom);

  setOperationAction(ISD::ROTL, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);

  setSchedulingPreference(Sched::VLIW);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   AMDGPU::MOV,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

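  // TXD and TXD_SHADOW sample with explicit derivatives: the horizontal and
  // vertical gradients are staged into two temporary 128-bit registers via
  // TEX_SET_GRADIENTS_H/V, and the implicit uses on the sample instruction
  // keep those temporaries alive until the sample executes.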
  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
              .addOperand(MI->getOperand(0))
              .addReg(0);
      break;

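  // Conditional branches are expanded into a PRED_X that compares the
  // condition against zero and sets PREDICATE_BIT (with the PUSH flag),
  // followed by a JUMP predicated on that bit.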
  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN);
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

using namespace llvm::Intrinsic;
using namespace llvm::AMDGPUIntrinsic;

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::ROTL: return LowerROTL(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::SETCC: return LowerSETCC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FPOW: return LowerFPOW(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32) // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(),
          Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    DebugLoc DL = Op.getDebugLoc();
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));

      return SDValue(interp, slot % 2);
    }

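    // The grid configuration (ngroups, global size, local size) is passed in
    // as nine consecutive dwords of the implicit parameter buffer; see
    // LowerImplicitParameter for the actual load.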
    case r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace the chain value inside
    // CustomWidenLowerNode, so do it explicitly here.
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
  }
}

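// An FP_TO_UINT with an i1 result only has to distinguish zero from non-zero,
// so it is lowered to a single SETCC against 0.0f.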
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      Op.getDebugLoc(),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue CC = Op.getOperand(1);
  SDValue LHS   = Op.getOperand(2);
  SDValue RHS   = Op.getOperand(3);
  SDValue JumpT  = Op.getOperand(4);
  SDValue CmpValue;
  SDValue Result;

  if (LHS.getValueType() == MVT::i32) {
    CmpValue = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::i32,
        LHS, RHS,
        DAG.getConstant(-1, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        CC);
  } else if (LHS.getValueType() == MVT::f32) {
    CmpValue = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::f32,
        LHS, RHS,
        DAG.getConstantFP(1.0f, MVT::f32),
        DAG.getConstantFP(0.0f, MVT::f32),
        CC);
  } else {
    assert(0 && "Not valid type for br_cc");
  }
  Result = DAG.getNode(
      AMDGPUISD::BRANCH_COND,
      CmpValue.getDebugLoc(),
      MVT::Other, Chain,
      JumpT, CmpValue);
  return Result;
}

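/// Implicit parameters live in the first dwords of the PARAM_I_ADDRESS space;
/// the requested dword offset is converted to a byte offset and loaded
/// through a null-based pointer in that address space.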
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   DebugLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                      AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}

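// ROTL is implemented with BITALIGN, which effectively extracts a 32-bit
// window from the concatenation {Src, Src} shifted right by the third
// operand, so rotl(x, n) becomes bitalign(x, x, 32 - n).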
SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
                     Op.getOperand(0),
                     Op.getOperand(0),
                     DAG.getNode(ISD::SUB, DL, VT,
                                 DAG.getConstant(32, MVT::i32),
                                 Op.getOperand(1)));
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a CND* instruction:
  // CND* instructions require RHS to be zero.  Some SELECT_CC nodes that
  // can be lowered to CND* instructions can also be lowered to SET*
  // instructions.  CND* instructions are cheaper, because they don't
  // require additional instructions to convert their result to the correct
  // value type, so this check should be first.
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Try to lower to a SET* instruction:
  //
  // CompareVT == MVT::f32 and VT == MVT::i32 is supported by the hardware,
  // but for the other case where CompareVT != VT, all operands of
  // SELECT_CC need to have the same value type, so we need to change True and
  // False to be the same type as LHS and RHS, and then convert the result of
  // the select_cc back to the correct type.

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False)) {
    if (CompareVT != VT && VT == MVT::f32 && CompareVT == MVT::i32) {
      SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
          LHS, RHS,
          DAG.getConstant(-1, MVT::i32),
          DAG.getConstant(0, MVT::i32),
          CC);
      // Convert integer values of true (-1) and false (0) to fp values of
      // true (1.0f) and false (0.0f).
      SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
                                                DAG.getConstant(1, MVT::i32));
      return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
    } else {
      // This SELECT_CC is already legal.
      return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
    }
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS,
                             HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

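// SELECT is lowered as a SELECT_CC comparing the condition against zero:
// select(cond, t, f) -> select_cc(cond, 0, t, f, setne).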
SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
      Op.getDebugLoc(),
      Op.getValueType(),
      Op.getOperand(0),
      DAG.getConstant(0, MVT::i32),
      Op.getOperand(1),
      Op.getOperand(2),
      DAG.getCondCode(ISD::SETNE));
}

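// SETCC is lowered to a SELECT_CC that produces the hardware true/false
// values (-1 or 1.0f), which is then normalized to the 0/1 an i32 setcc
// must produce by masking with 1.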
SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue CC  = Op.getOperand(2);
  DebugLoc DL = Op.getDebugLoc();
  assert(Op.getValueType() == MVT::i32);
  if (LHS.getValueType() == MVT::i32) {
    Cond = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::i32,
        LHS, RHS,
        DAG.getConstant(-1, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        CC);
  } else if (LHS.getValueType() == MVT::f32) {
    Cond = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::f32,
        LHS, RHS,
        DAG.getConstantFP(1.0f, MVT::f32),
        DAG.getConstantFP(0.0f, MVT::f32),
        CC);
    Cond = DAG.getNode(
        ISD::FP_TO_SINT,
        DL,
        MVT::i32,
        Cond);
  } else {
    assert(0 && "Not valid type for set_cc");
  }
  Cond = DAG.getNode(
      ISD::AND,
      DL,
      MVT::i32,
      DAG.getConstant(1, MVT::i32),
      Cond);
  return Cond;
}

/// LLVM generates byte-addressed pointers.  For indirect addressing, we need
/// to convert these pointers to a register index.  Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
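/// For example, with a StackWidth of 1 each index covers 4 bytes, so the
/// byte address is shifted right by 2; a width of 2 covers 8 bytes (shift
/// by 3) and a width of 4 covers the full 16-byte register (shift by 4).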
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

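/// Map a flat element index onto a (channel, pointer increment) pair for the
/// given stack width: with width 1 every element goes in channel 0 of
/// successive registers, with width 2 elements alternate between channels 0
/// and 1, and with width 4 all four elements share a single register.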
773
774void R600TargetLowering::getStackAddress(unsigned StackWidth,
775                                         unsigned ElemIdx,
776                                         unsigned &Channel,
777                                         unsigned &PtrIncr) const {
778  switch (StackWidth) {
779  default:
780  case 1:
781    Channel = 0;
782    if (ElemIdx > 0) {
783      PtrIncr = 1;
784    } else {
785      PtrIncr = 0;
786    }
787    break;
788  case 2:
789    Channel = ElemIdx % 2;
790    if (ElemIdx == 2) {
791      PtrIncr = 1;
792    } else {
793      PtrIncr = 0;
794    }
795    break;
796  case 4:
797    Channel = ElemIdx;
798    PtrIncr = 0;
799    break;
800  }
801}
802
803SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
804  DebugLoc DL = Op.getDebugLoc();
805  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
806  SDValue Chain = Op.getOperand(0);
807  SDValue Value = Op.getOperand(1);
808  SDValue Ptr = Op.getOperand(2);
809
810  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
811      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
812    // Convert pointer from byte address to dword address.
813    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
814                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
815                                  Ptr, DAG.getConstant(2, MVT::i32)));
816
817    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
818      assert(!"Truncated and indexed stores not supported yet");
819    } else {
820      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
821    }
822    return Chain;
823  }
824
825  EVT ValueVT = Value.getValueType();
826
827  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
828    return SDValue();
829  }
830
831  // Lowering for indirect addressing
832
833  const MachineFunction &MF = DAG.getMachineFunction();
834  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
835                                         getTargetMachine().getFrameLowering());
836  unsigned StackWidth = TFL->getStackWidth(MF);
837
838  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
839
840  if (ValueVT.isVector()) {
841    unsigned NumElemVT = ValueVT.getVectorNumElements();
842    EVT ElemVT = ValueVT.getVectorElementType();
843    SDValue Stores[4];
844
845    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
846                                      "vector width in load");
847
848    for (unsigned i = 0; i < NumElemVT; ++i) {
849      unsigned Channel, PtrIncr;
850      getStackAddress(StackWidth, i, Channel, PtrIncr);
851      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
852                        DAG.getConstant(PtrIncr, MVT::i32));
853      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
854                                 Value, DAG.getConstant(i, MVT::i32));
855
856      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
857                              Chain, Elem, Ptr,
858                              DAG.getTargetConstant(Channel, MVT::i32));
859    }
860     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
861   } else {
862    if (ValueVT == MVT::i8) {
863      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
864    }
865    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
866    DAG.getTargetConstant(0, MVT::i32)); // Channel
867  }
868
869  return Chain;
870}

// Returns 512 + (kc_bank << 12).
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
        isa<Constant>(LoadNode->getSrcValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) << 2) + chan) * 4 here and
        // then divide by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
    } else {
      // A non-constant ptr can't be folded; keep it as a vector load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() - 9, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
        Result,
        Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing.
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

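// FPOW is expanded with the usual identity pow(x, y) = exp2(y * log2(x)).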
SDValue R600TargetLowering::LowerFPOW(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
  SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
  return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      DebugLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
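  // Kernel arguments start at byte offset 36: dword 9 of the parameter
  // space, immediately after the nine implicit parameters (see
  // LowerImplicitParameter).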
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
                            DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    EVT VT = Ins[i].VT;
    Type *ArgType = FuncArg->getType();
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
                             32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    EVT ArgVT;
    if (ArgSizeInBits < VT.getSizeInBits()) {
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    } else {
      ArgVT = VT;
    }
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                                    AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    ParamOffsetBytes += ArgBytes;
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
                           SelectCC.getOperand(0), // LHS
                           SelectCC.getOperand(1), // RHS
                           DAG.getConstant(-1, MVT::i32), // True
                           DAG.getConstant(0, MVT::i32),  // False
                           SelectCC.getOperand(4)); // CC
  }

  // EXTRACT_VECTOR_ELT of a BUILD_VECTOR generated by custom lowering
  // also needs a custom combine.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
            Arg->getOperand(0).getOperand(Element));
      }
    }
  }

  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode() ||
        cast<CondCodeSDNode>(N->getOperand(4))->get() != ISD::SETEQ) {
      return SDValue();
    }

    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(LHS->getOperand(4))->get();
    CCOpcode = ISD::getSetCCInverse(
                        CCOpcode, LHS.getOperand(0).getValueType().isInteger());
    return DAG.getSelectCC(N->getDebugLoc(),
                           LHS.getOperand(0),
                           LHS.getOperand(1),
                           LHS.getOperand(2),
                           LHS.getOperand(3),
                           CCOpcode);
  }
  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;
    SDValue NewBldVec[4] = {
        DAG.getUNDEF(MVT::f32),
        DAG.getUNDEF(MVT::f32),
        DAG.getUNDEF(MVT::f32),
        DAG.getUNDEF(MVT::f32)
    };
    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7) // SWZ_W
    };
    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
      if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
        if (C->isZero()) {
          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
        } else if (C->isExactlyValue(1.0)) {
          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_1
        } else {
          NewBldVec[i] = Arg.getOperand(i);
        }
      } else {
        NewBldVec[i] = Arg.getOperand(i);
      }
    }
    DebugLoc DL = N->getDebugLoc();
    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  }
  return SDValue();
}
1198