R600ISelLowering.cpp revision 29b15a378045762ce09642ab9dd741ece41f59a3
//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  computeRegisterProperties();

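  // There are no native vector ALU operations for these, so marking them
  // Expand lets the legalizer unroll each v4 node into four scalar
  // operations.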
  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);

  setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
  setOperationAction(ISD::AND,  MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  setOperationAction(ISD::FPOW, MVT::f32, Custom);

  setOperationAction(ISD::ROTL, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);

  setSchedulingPreference(Sched::VLIW);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::SHADER_TYPE: break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   AMDGPU::MOV,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

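  // TXD_SHADOW is identical to TXD except that it samples with
  // TEX_SAMPLE_C_G, the depth-comparison (shadow) variant.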
  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
            .addReg(0);
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // The instruction is left unmodified if it is not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = llvm::next(I)->getOpcode() == AMDGPU::RETURN;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

using namespace llvm::Intrinsic;
using namespace llvm::AMDGPUIntrinsic;

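// Accumulates per-channel scalar exports into a single EXPORT node per slot:
// the first scalar seen for a slot creates the export with a one-channel
// write mask; later scalars for the same slot are inserted into the existing
// vector operand and their channel is OR'd into the mask.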
static SDValue
InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
    unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
    SDValue Scalar, SDValue Chain) {
  if (!ExportMap[Slot]) {
    SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
      DL, MVT::v4f32,
      DAG.getUNDEF(MVT::v4f32),
      Scalar,
      DAG.getConstant(Channel, MVT::i32));

    unsigned Mask = 1 << Channel;

    const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
        DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
        DAG.getConstant(Mask, MVT::i32)};

    SDValue Res = DAG.getNode(
        AMDGPUISD::EXPORT,
        DL,
        MVT::Other,
        Ops, 6);
    ExportMap[Slot] = Res.getNode();
    return Res;
  }

  SDNode *ExportInstruction = ExportMap[Slot];
  SDValue PreviousVector = ExportInstruction->getOperand(1);
  SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
      DL, MVT::v4f32,
      PreviousVector,
      Scalar,
      DAG.getConstant(Channel, MVT::i32));

  unsigned Mask = cast<ConstantSDNode>(ExportInstruction->getOperand(5))
      ->getZExtValue();
  Mask |= (1 << Channel);

  const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
      DAG.getConstant(Inst, MVT::i32),
      DAG.getConstant(Type, MVT::i32),
      DAG.getConstant(Slot, MVT::i32),
      DAG.getConstant(Mask, MVT::i32)};

  DAG.UpdateNodeOperands(ExportInstruction,
      Ops, 6);

  return Chain;
}

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::ROTL: return LowerROTL(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::SETCC: return LowerSETCC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FPOW: return LowerFPOW(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      if (!MRI.isLiveOut(Reg)) {
        MRI.addLiveOut(Reg);
      }
      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_pixel_color: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();

      SDNode **OutputsMap = MFI->Outputs;
      return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
          RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
          Chain);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    DebugLoc DL = Op.getDebugLoc();
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));

      return SDValue(interp, slot % 2);
    }

    case r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM does not seem to replace the chain value inside
    // CustomWidenLowerNode, so do it explicitly here.
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  }
}

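// FP_TO_UINT producing i1 only has to distinguish zero from non-zero, so it
// is lowered as a SETNE comparison against 0.0f.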
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      Op.getDebugLoc(),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

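// BR_CC is lowered as a SELECT_CC that materializes the condition as
// all-ones/0 (i32) or 1.0f/0.0f (f32), followed by an AMDGPUISD::BRANCH_COND
// on that value; the BRANCH_COND_f32/i32 pseudos it selects to are expanded
// by the custom inserter above into a PRED_X compare-with-zero plus a
// predicated JUMP.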
SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue CC = Op.getOperand(1);
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue JumpT = Op.getOperand(4);
  SDValue CmpValue;
  SDValue Result;

  if (LHS.getValueType() == MVT::i32) {
    CmpValue = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::i32,
        LHS, RHS,
        DAG.getConstant(-1, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        CC);
  } else if (LHS.getValueType() == MVT::f32) {
    CmpValue = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::f32,
        LHS, RHS,
        DAG.getConstantFP(1.0f, MVT::f32),
        DAG.getConstantFP(0.0f, MVT::f32),
        CC);
  } else {
    assert(0 && "Invalid type for br_cc");
  }
  Result = DAG.getNode(
      AMDGPUISD::BRANCH_COND,
      CmpValue.getDebugLoc(),
      MVT::Other, Chain,
      JumpT, CmpValue);
  return Result;
}

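// Implicit kernel parameters live in the PARAM_I_ADDRESS space at fixed
// dword offsets: 0-2 hold ngroups.{xyz}, 3-5 global_size.{xyz}, and 6-8
// local_size.{xyz} (see the intrinsic lowering above).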
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   DebugLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                      AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

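// Assuming AMDGPUISD::BITALIGN has the hardware bit-align semantics
// dst = (src0:src1) >> (src2 & 31), i.e. it selects 32 bits from the 64-bit
// concatenation of its first two operands, feeding x for both inputs with a
// shift amount of 32 - n yields rotl(x, n).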
SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
                     Op.getOperand(0),
                     Op.getOperand(0),
                     DAG.getNode(ISD::SUB, DL, VT,
                                 DAG.getConstant(32, MVT::i32),
                                 Op.getOperand(1)));
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type.
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a CND* instruction:
  // CND* instructions require RHS to be zero.  Some SELECT_CC nodes that
  // can be lowered to CND* instructions can also be lowered to SET*
  // instructions.  CND* instructions are cheaper, because they don't
  // require additional instructions to convert their result to the correct
  // value type, so this check should be first.
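  // For example, (select_cc x, 0, a, b, setgt) can be emitted directly as a
  // conditional-move (CNDGT-style) instruction on x.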
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than one pattern for
      // integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Try to lower to a SET* instruction:
  // We need all the operands of SELECT_CC to have the same value type, so if
  // necessary we need to change True and False to be the same type as LHS and
  // RHS, and then convert the result of the select_cc back to the correct type.

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False)) {
    if (CompareVT != VT) {
      if (VT == MVT::f32 && CompareVT == MVT::i32) {
        SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
            LHS, RHS,
            DAG.getConstant(-1, MVT::i32),
            DAG.getConstant(0, MVT::i32),
            CC);
        // Convert integer values of true (-1) and false (0) to fp values of
        // true (1.0f) and false (0.0f).
        SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
                                  DAG.getConstant(1, MVT::i32));
        return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
      } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
        SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
            LHS, RHS,
            DAG.getConstantFP(1.0f, MVT::f32),
            DAG.getConstantFP(0.0f, MVT::f32),
            CC);
        // Convert fp values of true (1.0f) and false (0.0f) to integer values
        // of true (-1) and false (0).
        SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
        return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
      } else {
        // There should not be any other type pairings.
        assert(!"Unhandled operand type pairings in SELECT_CC");
      }
    } else {
      // This SELECT_CC is already legal.
      return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
    }
  }

  // Possible Min/Max pattern.
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS,
                             HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
      Op.getDebugLoc(),
      Op.getValueType(),
      Op.getOperand(0),
      DAG.getConstant(0, MVT::i32),
      Op.getOperand(1),
      Op.getOperand(2),
      DAG.getCondCode(ISD::SETNE));
}

SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  DebugLoc DL = Op.getDebugLoc();
  assert(Op.getValueType() == MVT::i32);
  if (LHS.getValueType() == MVT::i32) {
    Cond = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::i32,
        LHS, RHS,
        DAG.getConstant(-1, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        CC);
  } else if (LHS.getValueType() == MVT::f32) {
    Cond = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::f32,
        LHS, RHS,
        DAG.getConstantFP(1.0f, MVT::f32),
        DAG.getConstantFP(0.0f, MVT::f32),
        CC);
    Cond = DAG.getNode(
        ISD::FP_TO_SINT,
        DL,
        MVT::i32,
        Cond);
  } else {
    assert(0 && "Invalid type for set_cc");
  }
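  // The select above yields -1 for true on the i32 path and 1 on the f32
  // path (after FP_TO_SINT); AND with 1 canonicalizes the SETCC result to
  // 0 or 1.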
  Cond = DAG.getNode(
      ISD::AND,
      DL,
      MVT::i32,
      DAG.getConstant(1, MVT::i32),
      Cond);
  return Cond;
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
    // Convert pointer from byte address to dword address.
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                  Ptr, DAG.getConstant(2, MVT::i32)));

    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
      assert(!"Truncated and indexed stores not supported yet");
    } else {
      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    }
    return Chain;
  }
  return SDValue();
}

// Returns 512 + (kc_bank << 12) for the given constant-buffer address space,
// or -1 if AddressSpace is not a constant buffer.
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
        isa<Constant>(LoadNode->getSrcValue())) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the constant position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // where const_index is the Ptr value LLVM computes using an alignment
        // of 16.  Thus we add ConstantBlock * 16 + 4 * chan here and then
        // divide by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
    } else {
      // A non-constant Ptr can't be folded; keep it as a v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
        Result,
        Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  return SDValue();
}

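// FPOW is lowered with the identity pow(x, y) = exp2(y * log2(x)), which is
// only exact for x > 0 (the usual GPU approximation).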
SDValue R600TargetLowering::LowerFPOW(SDValue Op,
    SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
  SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
  return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      DebugLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
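  // The first nine dwords (36 bytes) of the parameter space hold the implicit
  // parameters read by LowerImplicitParameter above (ngroups, global_size,
  // local_size), so explicit kernel arguments start at byte offset 36.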
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
                            DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    EVT VT = Ins[i].VT;
    Type *ArgType = FuncArg->getType();
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
                             32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    EVT ArgVT;
    if (ArgSizeInBits < VT.getSizeInBits()) {
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    } else {
      ArgVT = VT;
    }
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                                    AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                       MachinePointerInfo(new Argument(PtrTy)),
                                       ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    ParamOffsetBytes += ArgBytes;
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }
  // An extract_vector_elt of a build_vector generated by our custom lowering
  // also needs to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
            Arg->getOperand(0).getOperand(Element));
      }
    }
  }
  }
  return SDValue();
}