R600ISelLowering.cpp revision b5632b5b456db647b42239cbd4d8b58c82290c4e
1//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for R600
12//
13//===----------------------------------------------------------------------===//
14
15#include "R600ISelLowering.h"
16#include "R600Defines.h"
17#include "R600InstrInfo.h"
18#include "R600MachineFunctionInfo.h"
19#include "llvm/CodeGen/MachineFrameInfo.h"
20#include "llvm/CodeGen/MachineInstrBuilder.h"
21#include "llvm/CodeGen/MachineRegisterInfo.h"
22#include "llvm/CodeGen/SelectionDAG.h"
23#include "llvm/IR/Argument.h"
24#include "llvm/IR/Function.h"
25
26using namespace llvm;
27
28R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
29    AMDGPUTargetLowering(TM) {
30  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
31  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
32  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
33  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
34  computeRegisterProperties();
35
36  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
37  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
38  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
39  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
40
41  setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
42  setOperationAction(ISD::AND,  MVT::v4i32, Expand);
43  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
44  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
45  setOperationAction(ISD::MUL,  MVT::v2i32, Expand);
46  setOperationAction(ISD::MUL,  MVT::v4i32, Expand);
47  setOperationAction(ISD::OR, MVT::v4i32, Expand);
48  setOperationAction(ISD::OR, MVT::v2i32, Expand);
49  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
50  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
51  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
52  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
53  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
54  setOperationAction(ISD::SRA, MVT::v4i32, Expand);
55  setOperationAction(ISD::SRA, MVT::v2i32, Expand);
56  setOperationAction(ISD::SUB, MVT::v4i32, Expand);
57  setOperationAction(ISD::SUB, MVT::v2i32, Expand);
58  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
59  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
60  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
61  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
62  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
63  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
64
65  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
66  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
67
68  setOperationAction(ISD::FSUB, MVT::f32, Expand);
69
70  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
71  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
72  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
73
74  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
75  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
76
77  setOperationAction(ISD::SETCC, MVT::i32, Expand);
78  setOperationAction(ISD::SETCC, MVT::f32, Expand);
79  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
80
81  setOperationAction(ISD::SELECT, MVT::i32, Custom);
82  setOperationAction(ISD::SELECT, MVT::f32, Custom);
83
84  setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
85  setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
86
87  // Legalize loads and stores to the private address space.
88  setOperationAction(ISD::LOAD, MVT::i32, Custom);
89  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
90  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
91  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
92  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
93  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
94  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
95  setOperationAction(ISD::STORE, MVT::i8, Custom);
96  setOperationAction(ISD::STORE, MVT::i32, Custom);
97  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
98  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
99
100  setOperationAction(ISD::LOAD, MVT::i32, Custom);
101  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
102  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
103
104  setTargetDAGCombine(ISD::FP_ROUND);
105  setTargetDAGCombine(ISD::FP_TO_SINT);
106  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
107  setTargetDAGCombine(ISD::SELECT_CC);
108
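  // Booleans are 0 / -1 (ZeroOrNegativeOneBooleanContent); this matches the
  // -1/0 values produced by the SET* patterns in LowerSELECT_CC below.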
109  setBooleanContents(ZeroOrNegativeOneBooleanContent);
110  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
111  setSchedulingPreference(Sched::VLIW);
112}
113
114MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
115    MachineInstr * MI, MachineBasicBlock * BB) const {
116  MachineFunction * MF = BB->getParent();
117  MachineRegisterInfo &MRI = MF->getRegInfo();
118  MachineBasicBlock::iterator I = *MI;
119  const R600InstrInfo *TII =
120    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
121
122  switch (MI->getOpcode()) {
123  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
124  case AMDGPU::CLAMP_R600: {
125    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
126                                                   AMDGPU::MOV,
127                                                   MI->getOperand(0).getReg(),
128                                                   MI->getOperand(1).getReg());
129    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
130    break;
131  }
132
133  case AMDGPU::FABS_R600: {
134    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
135                                                    AMDGPU::MOV,
136                                                    MI->getOperand(0).getReg(),
137                                                    MI->getOperand(1).getReg());
138    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
139    break;
140  }
141
142  case AMDGPU::FNEG_R600: {
143    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
144                                                    AMDGPU::MOV,
145                                                    MI->getOperand(0).getReg(),
146                                                    MI->getOperand(1).getReg());
147    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
148    break;
149  }
150
151  case AMDGPU::MASK_WRITE: {
152    unsigned maskedRegister = MI->getOperand(0).getReg();
153    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
154    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
155    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
156    break;
157  }
158
159  case AMDGPU::MOV_IMM_F32:
160    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
161                     MI->getOperand(1).getFPImm()->getValueAPF()
162                         .bitcastToAPInt().getZExtValue());
163    break;
164  case AMDGPU::MOV_IMM_I32:
165    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
166                     MI->getOperand(1).getImm());
167    break;
168  case AMDGPU::CONST_COPY: {
169    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
170        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
171    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
172        MI->getOperand(1).getImm());
173    break;
174  }
175
176  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
177  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
178    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
179
180    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
181            .addOperand(MI->getOperand(0))
182            .addOperand(MI->getOperand(1))
183            .addImm(EOP); // Set End of program bit
184    break;
185  }
186
187  case AMDGPU::TXD: {
188    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
189    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
190    MachineOperand &RID = MI->getOperand(4);
191    MachineOperand &SID = MI->getOperand(5);
192    unsigned TextureId = MI->getOperand(6).getImm();
193    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
194    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
195
196    switch (TextureId) {
197    case 5: // Rect
198      CTX = CTY = 0;
199      break;
200    case 6: // Shadow1D
201      SrcW = SrcZ;
202      break;
203    case 7: // Shadow2D
204      SrcW = SrcZ;
205      break;
206    case 8: // ShadowRect
207      CTX = CTY = 0;
208      SrcW = SrcZ;
209      break;
210    case 9: // 1DArray
211      SrcZ = SrcY;
212      CTZ = 0;
213      break;
214    case 10: // 2DArray
215      CTZ = 0;
216      break;
217    case 11: // Shadow1DArray
218      SrcZ = SrcY;
219      CTZ = 0;
220      break;
221    case 12: // Shadow2DArray
222      CTZ = 0;
223      break;
224    }
225    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
226            .addOperand(MI->getOperand(3))
227            .addImm(SrcX)
228            .addImm(SrcY)
229            .addImm(SrcZ)
230            .addImm(SrcW)
231            .addImm(0)
232            .addImm(0)
233            .addImm(0)
234            .addImm(0)
235            .addImm(1)
236            .addImm(2)
237            .addImm(3)
238            .addOperand(RID)
239            .addOperand(SID)
240            .addImm(CTX)
241            .addImm(CTY)
242            .addImm(CTZ)
243            .addImm(CTW);
244    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
245            .addOperand(MI->getOperand(2))
246            .addImm(SrcX)
247            .addImm(SrcY)
248            .addImm(SrcZ)
249            .addImm(SrcW)
250            .addImm(0)
251            .addImm(0)
252            .addImm(0)
253            .addImm(0)
254            .addImm(1)
255            .addImm(2)
256            .addImm(3)
257            .addOperand(RID)
258            .addOperand(SID)
259            .addImm(CTX)
260            .addImm(CTY)
261            .addImm(CTZ)
262            .addImm(CTW);
263    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
264            .addOperand(MI->getOperand(0))
265            .addOperand(MI->getOperand(1))
266            .addImm(SrcX)
267            .addImm(SrcY)
268            .addImm(SrcZ)
269            .addImm(SrcW)
270            .addImm(0)
271            .addImm(0)
272            .addImm(0)
273            .addImm(0)
274            .addImm(1)
275            .addImm(2)
276            .addImm(3)
277            .addOperand(RID)
278            .addOperand(SID)
279            .addImm(CTX)
280            .addImm(CTY)
281            .addImm(CTZ)
282            .addImm(CTW)
283            .addReg(T0, RegState::Implicit)
284            .addReg(T1, RegState::Implicit);
285    break;
286  }
287
288  case AMDGPU::TXD_SHADOW: {
289    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
290    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
291    MachineOperand &RID = MI->getOperand(4);
292    MachineOperand &SID = MI->getOperand(5);
293    unsigned TextureId = MI->getOperand(6).getImm();
294    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
295    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
296
297    switch (TextureId) {
298    case 5: // Rect
299      CTX = CTY = 0;
300      break;
301    case 6: // Shadow1D
302      SrcW = SrcZ;
303      break;
304    case 7: // Shadow2D
305      SrcW = SrcZ;
306      break;
307    case 8: // ShadowRect
308      CTX = CTY = 0;
309      SrcW = SrcZ;
310      break;
311    case 9: // 1DArray
312      SrcZ = SrcY;
313      CTZ = 0;
314      break;
315    case 10: // 2DArray
316      CTZ = 0;
317      break;
318    case 11: // Shadow1DArray
319      SrcZ = SrcY;
320      CTZ = 0;
321      break;
322    case 12: // Shadow2DArray
323      CTZ = 0;
324      break;
325    }
326
327    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
328            .addOperand(MI->getOperand(3))
329            .addImm(SrcX)
330            .addImm(SrcY)
331            .addImm(SrcZ)
332            .addImm(SrcW)
333            .addImm(0)
334            .addImm(0)
335            .addImm(0)
336            .addImm(0)
337            .addImm(1)
338            .addImm(2)
339            .addImm(3)
340            .addOperand(RID)
341            .addOperand(SID)
342            .addImm(CTX)
343            .addImm(CTY)
344            .addImm(CTZ)
345            .addImm(CTW);
346    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
347            .addOperand(MI->getOperand(2))
348            .addImm(SrcX)
349            .addImm(SrcY)
350            .addImm(SrcZ)
351            .addImm(SrcW)
352            .addImm(0)
353            .addImm(0)
354            .addImm(0)
355            .addImm(0)
356            .addImm(1)
357            .addImm(2)
358            .addImm(3)
359            .addOperand(RID)
360            .addOperand(SID)
361            .addImm(CTX)
362            .addImm(CTY)
363            .addImm(CTZ)
364            .addImm(CTW);
365    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
366            .addOperand(MI->getOperand(0))
367            .addOperand(MI->getOperand(1))
368            .addImm(SrcX)
369            .addImm(SrcY)
370            .addImm(SrcZ)
371            .addImm(SrcW)
372            .addImm(0)
373            .addImm(0)
374            .addImm(0)
375            .addImm(0)
376            .addImm(1)
377            .addImm(2)
378            .addImm(3)
379            .addOperand(RID)
380            .addOperand(SID)
381            .addImm(CTX)
382            .addImm(CTY)
383            .addImm(CTZ)
384            .addImm(CTW)
385            .addReg(T0, RegState::Implicit)
386            .addReg(T1, RegState::Implicit);
387    break;
388  }
389
390  case AMDGPU::BRANCH:
391      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
392              .addOperand(MI->getOperand(0));
393      break;
394
395  case AMDGPU::BRANCH_COND_f32: {
396    MachineInstr *NewMI =
397      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
398              AMDGPU::PREDICATE_BIT)
399              .addOperand(MI->getOperand(1))
400              .addImm(OPCODE_IS_NOT_ZERO)
401              .addImm(0); // Flags
402    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
403    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
404            .addOperand(MI->getOperand(0))
405            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
406    break;
407  }
408
409  case AMDGPU::BRANCH_COND_i32: {
410    MachineInstr *NewMI =
411      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
412            AMDGPU::PREDICATE_BIT)
413            .addOperand(MI->getOperand(1))
414            .addImm(OPCODE_IS_NOT_ZERO_INT)
415            .addImm(0); // Flags
416    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
417    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
418            .addOperand(MI->getOperand(0))
419            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
420    break;
421  }
422
423  case AMDGPU::EG_ExportSwz:
424  case AMDGPU::R600_ExportSwz: {
425    // The instruction is left unmodified if it is not the last one of its type.
426    bool isLastInstructionOfItsType = true;
427    unsigned InstExportType = MI->getOperand(1).getImm();
428    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
429         EndBlock = BB->end(); NextExportInst != EndBlock;
430         NextExportInst = llvm::next(NextExportInst)) {
431      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
432          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
433        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
434            .getImm();
435        if (CurrentInstExportType == InstExportType) {
436          isLastInstructionOfItsType = false;
437          break;
438        }
439      }
440    }
441    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
442    if (!EOP && !isLastInstructionOfItsType)
443      return BB;
444    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
445    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
446            .addOperand(MI->getOperand(0))
447            .addOperand(MI->getOperand(1))
448            .addOperand(MI->getOperand(2))
449            .addOperand(MI->getOperand(3))
450            .addOperand(MI->getOperand(4))
451            .addOperand(MI->getOperand(5))
452            .addOperand(MI->getOperand(6))
453            .addImm(CfInst)
454            .addImm(EOP);
455    break;
456  }
457  case AMDGPU::RETURN: {
458    // RETURN instructions must have the live-out registers as implicit uses,
459    // otherwise they appear dead.
460    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
461    MachineInstrBuilder MIB(*MF, MI);
462    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
463      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
464    return BB;
465  }
466  }
467
468  MI->eraseFromParent();
469  return BB;
470}
471
472//===----------------------------------------------------------------------===//
473// Custom DAG Lowering Operations
474//===----------------------------------------------------------------------===//
475
476SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
477  switch (Op.getOpcode()) {
478  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
479  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
480  case ISD::SELECT: return LowerSELECT(Op, DAG);
481  case ISD::STORE: return LowerSTORE(Op, DAG);
482  case ISD::LOAD: return LowerLOAD(Op, DAG);
483  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
484  case ISD::INTRINSIC_VOID: {
485    SDValue Chain = Op.getOperand(0);
486    unsigned IntrinsicID =
487                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
488    switch (IntrinsicID) {
489    case AMDGPUIntrinsic::AMDGPU_store_output: {
490      MachineFunction &MF = DAG.getMachineFunction();
491      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
492      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
493      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
494      MFI->LiveOuts.push_back(Reg);
495      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
496    }
497    case AMDGPUIntrinsic::R600_store_swizzle: {
498      const SDValue Args[8] = {
499        Chain,
500        Op.getOperand(2), // Export Value
501        Op.getOperand(3), // ArrayBase
502        Op.getOperand(4), // Type
503        DAG.getConstant(0, MVT::i32), // SWZ_X
504        DAG.getConstant(1, MVT::i32), // SWZ_Y
505        DAG.getConstant(2, MVT::i32), // SWZ_Z
506        DAG.getConstant(3, MVT::i32) // SWZ_W
507      };
508      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
509          Args, 8);
510    }
511
512    // default for switch(IntrinsicID)
513    default: break;
514    }
515    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
516    break;
517  }
518  case ISD::INTRINSIC_WO_CHAIN: {
519    unsigned IntrinsicID =
520                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
521    EVT VT = Op.getValueType();
522    SDLoc DL(Op);
523    switch(IntrinsicID) {
524    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
525    case AMDGPUIntrinsic::R600_load_input: {
526      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
527      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
528      MachineFunction &MF = DAG.getMachineFunction();
529      MachineRegisterInfo &MRI = MF.getRegInfo();
530      MRI.addLiveIn(Reg);
531      return DAG.getCopyFromReg(DAG.getEntryNode(),
532          SDLoc(DAG.getEntryNode()), Reg, VT);
533    }
534
535    case AMDGPUIntrinsic::R600_interp_input: {
536      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
537      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
538      MachineSDNode *interp;
539      if (ijb < 0) {
540        const MachineFunction &MF = DAG.getMachineFunction();
541        const R600InstrInfo *TII =
542          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
543        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
544            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
545        return DAG.getTargetExtractSubreg(
546            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
547            DL, MVT::f32, SDValue(interp, 0));
548      }
549
550      MachineFunction &MF = DAG.getMachineFunction();
551      MachineRegisterInfo &MRI = MF.getRegInfo();
552      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
553      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
554      MRI.addLiveIn(RegisterI);
555      MRI.addLiveIn(RegisterJ);
556      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
557          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
558      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
559          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
560
561      if (slot % 4 < 2)
562        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
563            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
564            RegisterJNode, RegisterINode);
565      else
566        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
567            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
568            RegisterJNode, RegisterINode);
569      return SDValue(interp, slot % 2);
570    }
571    case AMDGPUIntrinsic::R600_tex:
572    case AMDGPUIntrinsic::R600_texc:
573    case AMDGPUIntrinsic::R600_txl:
574    case AMDGPUIntrinsic::R600_txlc:
575    case AMDGPUIntrinsic::R600_txb:
576    case AMDGPUIntrinsic::R600_txbc:
577    case AMDGPUIntrinsic::R600_txf:
578    case AMDGPUIntrinsic::R600_txq:
579    case AMDGPUIntrinsic::R600_ddx:
580    case AMDGPUIntrinsic::R600_ddy: {
581      unsigned TextureOp;
582      switch (IntrinsicID) {
583      case AMDGPUIntrinsic::R600_tex:
584        TextureOp = 0;
585        break;
586      case AMDGPUIntrinsic::R600_texc:
587        TextureOp = 1;
588        break;
589      case AMDGPUIntrinsic::R600_txl:
590        TextureOp = 2;
591        break;
592      case AMDGPUIntrinsic::R600_txlc:
593        TextureOp = 3;
594        break;
595      case AMDGPUIntrinsic::R600_txb:
596        TextureOp = 4;
597        break;
598      case AMDGPUIntrinsic::R600_txbc:
599        TextureOp = 5;
600        break;
601      case AMDGPUIntrinsic::R600_txf:
602        TextureOp = 6;
603        break;
604      case AMDGPUIntrinsic::R600_txq:
605        TextureOp = 7;
606        break;
607      case AMDGPUIntrinsic::R600_ddx:
608        TextureOp = 8;
609        break;
610      case AMDGPUIntrinsic::R600_ddy:
611        TextureOp = 9;
612        break;
613      default:
614        llvm_unreachable("Unknown Texture Operation");
615      }
616
617      SDValue TexArgs[19] = {
618        DAG.getConstant(TextureOp, MVT::i32),
619        Op.getOperand(1),
620        DAG.getConstant(0, MVT::i32),
621        DAG.getConstant(1, MVT::i32),
622        DAG.getConstant(2, MVT::i32),
623        DAG.getConstant(3, MVT::i32),
624        Op.getOperand(2),
625        Op.getOperand(3),
626        Op.getOperand(4),
627        DAG.getConstant(0, MVT::i32),
628        DAG.getConstant(1, MVT::i32),
629        DAG.getConstant(2, MVT::i32),
630        DAG.getConstant(3, MVT::i32),
631        Op.getOperand(5),
632        Op.getOperand(6),
633        Op.getOperand(7),
634        Op.getOperand(8),
635        Op.getOperand(9),
636        Op.getOperand(10)
637      };
638      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
639    }
640    case AMDGPUIntrinsic::AMDGPU_dp4: {
641      SDValue Args[8] = {
642      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
643          DAG.getConstant(0, MVT::i32)),
644      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
645          DAG.getConstant(0, MVT::i32)),
646      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
647          DAG.getConstant(1, MVT::i32)),
648      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
649          DAG.getConstant(1, MVT::i32)),
650      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
651          DAG.getConstant(2, MVT::i32)),
652      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
653          DAG.getConstant(2, MVT::i32)),
654      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
655          DAG.getConstant(3, MVT::i32)),
656      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
657          DAG.getConstant(3, MVT::i32))
658      };
659      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
660    }
661
662    case Intrinsic::r600_read_ngroups_x:
663      return LowerImplicitParameter(DAG, VT, DL, 0);
664    case Intrinsic::r600_read_ngroups_y:
665      return LowerImplicitParameter(DAG, VT, DL, 1);
666    case Intrinsic::r600_read_ngroups_z:
667      return LowerImplicitParameter(DAG, VT, DL, 2);
668    case Intrinsic::r600_read_global_size_x:
669      return LowerImplicitParameter(DAG, VT, DL, 3);
670    case Intrinsic::r600_read_global_size_y:
671      return LowerImplicitParameter(DAG, VT, DL, 4);
672    case Intrinsic::r600_read_global_size_z:
673      return LowerImplicitParameter(DAG, VT, DL, 5);
674    case Intrinsic::r600_read_local_size_x:
675      return LowerImplicitParameter(DAG, VT, DL, 6);
676    case Intrinsic::r600_read_local_size_y:
677      return LowerImplicitParameter(DAG, VT, DL, 7);
678    case Intrinsic::r600_read_local_size_z:
679      return LowerImplicitParameter(DAG, VT, DL, 8);
680
681    case Intrinsic::r600_read_tgid_x:
682      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
683                                  AMDGPU::T1_X, VT);
684    case Intrinsic::r600_read_tgid_y:
685      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
686                                  AMDGPU::T1_Y, VT);
687    case Intrinsic::r600_read_tgid_z:
688      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
689                                  AMDGPU::T1_Z, VT);
690    case Intrinsic::r600_read_tidig_x:
691      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
692                                  AMDGPU::T0_X, VT);
693    case Intrinsic::r600_read_tidig_y:
694      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
695                                  AMDGPU::T0_Y, VT);
696    case Intrinsic::r600_read_tidig_z:
697      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
698                                  AMDGPU::T0_Z, VT);
699    }
700    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
701    break;
702  }
703  } // end switch(Op.getOpcode())
704  return SDValue();
705}
706
707void R600TargetLowering::ReplaceNodeResults(SDNode *N,
708                                            SmallVectorImpl<SDValue> &Results,
709                                            SelectionDAG &DAG) const {
710  switch (N->getOpcode()) {
711  default: return;
712  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
713    return;
714  case ISD::LOAD: {
715    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
716    Results.push_back(SDValue(Node, 0));
717    Results.push_back(SDValue(Node, 1));
718    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
719    // function
720    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
721    return;
722  }
723  case ISD::STORE:
724    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
725    Results.push_back(SDValue(Node, 0));
726    return;
727  }
728}
729
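// FP_TO_UINT with an i1 result is marked Custom in the constructor; the i1
// result is simply "source != 0.0", so it is lowered to a SETCC against the
// floating-point zero constant.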
730SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
731  return DAG.getNode(
732      ISD::SETCC,
733      SDLoc(Op),
734      MVT::i1,
735      Op, DAG.getConstantFP(0.0f, MVT::f32),
736      DAG.getCondCode(ISD::SETNE)
737      );
738}
739
740SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
741                                                   SDLoc DL,
742                                                   unsigned DwordOffset) const {
743  unsigned ByteOffset = DwordOffset * 4;
744  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
745                                      AMDGPUAS::PARAM_I_ADDRESS);
746
747  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
748  assert(isInt<16>(ByteOffset));
749
750  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
751                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
752                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
753                     false, false, false, 0);
754}
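// For example (illustrative): r600_read_ngroups_y is dispatched above with
// DwordOffset == 1, so its value is loaded from byte offset 4 of the
// PARAM_I_ADDRESS space.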
755
756SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
757
758  MachineFunction &MF = DAG.getMachineFunction();
759  const AMDGPUFrameLowering *TFL =
760   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
761
762  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
763  assert(FIN);
764
765  unsigned FrameIndex = FIN->getIndex();
766  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
767  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
768}
769
770bool R600TargetLowering::isZero(SDValue Op) const {
771  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
772    return Cst->isNullValue();
773  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
774    return CstFP->isZero();
775  } else {
776    return false;
777  }
778}
779
780SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
781  SDLoc DL(Op);
782  EVT VT = Op.getValueType();
783
784  SDValue LHS = Op.getOperand(0);
785  SDValue RHS = Op.getOperand(1);
786  SDValue True = Op.getOperand(2);
787  SDValue False = Op.getOperand(3);
788  SDValue CC = Op.getOperand(4);
789  SDValue Temp;
790
791  // LHS and RHS are guaranteed to be the same value type
792  EVT CompareVT = LHS.getValueType();
793
794  // Check if we can lower this to a native operation.
795
796  // Try to lower to a SET* instruction:
797  //
798  // SET* can match the following patterns:
799  //
800  // select_cc f32, f32, -1,  0, cc_any
801  // select_cc f32, f32, 1.0f, 0.0f, cc_any
802  // select_cc i32, i32, -1,  0, cc_any
803  //
804
805  // Move hardware True/False values to the correct operand.
806  if (isHWTrueValue(False) && isHWFalseValue(True)) {
807    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
808    std::swap(False, True);
809    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
810  }
811
812  if (isHWTrueValue(True) && isHWFalseValue(False) &&
813      (CompareVT == VT || VT == MVT::i32)) {
814    // This can be matched by a SET* instruction.
815    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
816  }
817
818  // Try to lower to a CND* instruction:
819  //
820  // CND* can match the following patterns:
821  //
822  // select_cc f32, 0.0, f32, f32, cc_any
823  // select_cc f32, 0.0, i32, i32, cc_any
824  // select_cc i32, 0,   f32, f32, cc_any
825  // select_cc i32, 0,   i32, i32, cc_any
826  //
827  if (isZero(LHS) || isZero(RHS)) {
828    SDValue Cond = (isZero(LHS) ? RHS : LHS);
829    SDValue Zero = (isZero(LHS) ? LHS : RHS);
830    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
831    if (CompareVT != VT) {
832      // Bitcast True / False to the correct types.  This will end up being
833      // a nop, but it allows us to define only a single pattern in the
834      // .TD files for each CND* instruction rather than having to have
835      // one pattern for integer True/False and one for fp True/False
836      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
837      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
838    }
839    if (isZero(LHS)) {
840      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
841    }
842
843    switch (CCOpcode) {
844    case ISD::SETONE:
845    case ISD::SETUNE:
846    case ISD::SETNE:
847    case ISD::SETULE:
848    case ISD::SETULT:
849    case ISD::SETOLE:
850    case ISD::SETOLT:
851    case ISD::SETLE:
852    case ISD::SETLT:
853      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
854      Temp = True;
855      True = False;
856      False = Temp;
857      break;
858    default:
859      break;
860    }
861    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
862        Cond, Zero,
863        True, False,
864        DAG.getCondCode(CCOpcode));
865    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
866  }
867
868
869  // Possible Min/Max pattern
870  SDValue MinMax = LowerMinMax(Op, DAG);
871  if (MinMax.getNode()) {
872    return MinMax;
873  }
874
875  // If we make it this far, it means we have no native instructions to handle
876  // this SELECT_CC, so we must lower it.
877  SDValue HWTrue, HWFalse;
878
879  if (CompareVT == MVT::f32) {
880    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
881    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
882  } else if (CompareVT == MVT::i32) {
883    HWTrue = DAG.getConstant(-1, CompareVT);
884    HWFalse = DAG.getConstant(0, CompareVT);
885  }
886  else {
887    assert(!"Unhandled value type in LowerSELECT_CC");
888  }
889
890  // Lower this unsupported SELECT_CC into a combination of two supported
891  // SELECT_CC operations.
892  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
893
894  return DAG.getNode(ISD::SELECT_CC, DL, VT,
895      Cond, HWFalse,
896      True, False,
897      DAG.getCondCode(ISD::SETNE));
898}
899
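// SELECT is lowered to a SELECT_CC that compares the condition against zero
// with SETNE, so it reuses the SELECT_CC lowering above.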
900SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
901  return DAG.getNode(ISD::SELECT_CC,
902      SDLoc(Op),
903      Op.getValueType(),
904      Op.getOperand(0),
905      DAG.getConstant(0, MVT::i32),
906      Op.getOperand(1),
907      Op.getOperand(2),
908      DAG.getCondCode(ISD::SETNE));
909}
910
911/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
912/// convert these pointers to a register index.  Each register holds
913/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
914/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
915/// for indirect addressing.
916SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
917                                               unsigned StackWidth,
918                                               SelectionDAG &DAG) const {
919  unsigned SRLPad;
920  switch(StackWidth) {
921  case 1:
922    SRLPad = 2;
923    break;
924  case 2:
925    SRLPad = 3;
926    break;
927  case 4:
928    SRLPad = 4;
929    break;
930  default: llvm_unreachable("Invalid stack width");
931  }
932
933  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
934                     DAG.getConstant(SRLPad, MVT::i32));
935}
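// Worked example (illustrative): with StackWidth == 4 a register slot covers
// 16 bytes, so byte pointer 48 becomes register index 48 >> 4 == 3; with
// StackWidth == 1 the same pointer becomes dword index 48 >> 2 == 12.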
936
937void R600TargetLowering::getStackAddress(unsigned StackWidth,
938                                         unsigned ElemIdx,
939                                         unsigned &Channel,
940                                         unsigned &PtrIncr) const {
941  switch (StackWidth) {
942  default:
943  case 1:
944    Channel = 0;
945    if (ElemIdx > 0) {
946      PtrIncr = 1;
947    } else {
948      PtrIncr = 0;
949    }
950    break;
951  case 2:
952    Channel = ElemIdx % 2;
953    if (ElemIdx == 2) {
954      PtrIncr = 1;
955    } else {
956      PtrIncr = 0;
957    }
958    break;
959  case 4:
960    Channel = ElemIdx;
961    PtrIncr = 0;
962    break;
963  }
964}
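// Worked example (illustrative): with StackWidth == 2, ElemIdx 0..3 map to
// Channel 0,1,0,1 and PtrIncr 0,0,1,0.  The caller adds PtrIncr to the pointer
// cumulatively, so elements 2 and 3 end up in the next register slot.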
965
966SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
967  SDLoc DL(Op);
968  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
969  SDValue Chain = Op.getOperand(0);
970  SDValue Value = Op.getOperand(1);
971  SDValue Ptr = Op.getOperand(2);
972
973  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
974      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
975    // Convert pointer from byte address to dword address.
976    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
977                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
978                                  Ptr, DAG.getConstant(2, MVT::i32)));
979
980    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
981      assert(!"Truncated and indexed stores not supported yet");
982    } else {
983      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
984    }
985    return Chain;
986  }
987
988  EVT ValueVT = Value.getValueType();
989
990  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
991    return SDValue();
992  }
993
994  // Lowering for indirect addressing
995
996  const MachineFunction &MF = DAG.getMachineFunction();
997  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
998                                         getTargetMachine().getFrameLowering());
999  unsigned StackWidth = TFL->getStackWidth(MF);
1000
1001  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1002
1003  if (ValueVT.isVector()) {
1004    unsigned NumElemVT = ValueVT.getVectorNumElements();
1005    EVT ElemVT = ValueVT.getVectorElementType();
1006    SDValue Stores[4];
1007
1008    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1009                                      "vector width in store");
1010
1011    for (unsigned i = 0; i < NumElemVT; ++i) {
1012      unsigned Channel, PtrIncr;
1013      getStackAddress(StackWidth, i, Channel, PtrIncr);
1014      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1015                        DAG.getConstant(PtrIncr, MVT::i32));
1016      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1017                                 Value, DAG.getConstant(i, MVT::i32));
1018
1019      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1020                              Chain, Elem, Ptr,
1021                              DAG.getTargetConstant(Channel, MVT::i32));
1022    }
1023    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1024  } else {
1025    if (ValueVT == MVT::i8) {
1026      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1027    }
1028    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1029    DAG.getTargetConstant(0, MVT::i32)); // Channel
1030  }
1031
1032  return Chain;
1033}
1034
1035// Return (512 + (kc_bank << 12)) for the given constant buffer address space.
1036static int
1037ConstantAddressBlock(unsigned AddressSpace) {
1038  switch (AddressSpace) {
1039  case AMDGPUAS::CONSTANT_BUFFER_0:
1040    return 512;
1041  case AMDGPUAS::CONSTANT_BUFFER_1:
1042    return 512 + 4096;
1043  case AMDGPUAS::CONSTANT_BUFFER_2:
1044    return 512 + 4096 * 2;
1045  case AMDGPUAS::CONSTANT_BUFFER_3:
1046    return 512 + 4096 * 3;
1047  case AMDGPUAS::CONSTANT_BUFFER_4:
1048    return 512 + 4096 * 4;
1049  case AMDGPUAS::CONSTANT_BUFFER_5:
1050    return 512 + 4096 * 5;
1051  case AMDGPUAS::CONSTANT_BUFFER_6:
1052    return 512 + 4096 * 6;
1053  case AMDGPUAS::CONSTANT_BUFFER_7:
1054    return 512 + 4096 * 7;
1055  case AMDGPUAS::CONSTANT_BUFFER_8:
1056    return 512 + 4096 * 8;
1057  case AMDGPUAS::CONSTANT_BUFFER_9:
1058    return 512 + 4096 * 9;
1059  case AMDGPUAS::CONSTANT_BUFFER_10:
1060    return 512 + 4096 * 10;
1061  case AMDGPUAS::CONSTANT_BUFFER_11:
1062    return 512 + 4096 * 11;
1063  case AMDGPUAS::CONSTANT_BUFFER_12:
1064    return 512 + 4096 * 12;
1065  case AMDGPUAS::CONSTANT_BUFFER_13:
1066    return 512 + 4096 * 13;
1067  case AMDGPUAS::CONSTANT_BUFFER_14:
1068    return 512 + 4096 * 14;
1069  case AMDGPUAS::CONSTANT_BUFFER_15:
1070    return 512 + 4096 * 15;
1071  default:
1072    return -1;
1073  }
1074}
1075
1076SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1077{
1078  EVT VT = Op.getValueType();
1079  SDLoc DL(Op);
1080  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1081  SDValue Chain = Op.getOperand(0);
1082  SDValue Ptr = Op.getOperand(1);
1083  SDValue LoweredLoad;
1084
1085  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1086  if (ConstantBlock > -1) {
1087    SDValue Result;
1088    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
1089        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
1090        dyn_cast<ConstantSDNode>(Ptr)) {
1091      SDValue Slots[4];
1092      for (unsigned i = 0; i < 4; i++) {
1093        // We want the const position encoded with the following formula:
1094        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1095        // const_index is Ptr computed by llvm using an alignment of 16.
1096        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1097        // then divide by 4 at the ISel step.
1098        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1099            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1100        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1101      }
1102      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
1103    } else {
1104      // A non-constant ptr can't be folded; keep it as a v4i32 load.
1105      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1106          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1107          DAG.getConstant(LoadNode->getAddressSpace() -
1108                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1109          );
1110    }
1111
1112    if (!VT.isVector()) {
1113      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1114          DAG.getConstant(0, MVT::i32));
1115    }
1116
1117    SDValue MergedValues[2] = {
1118        Result,
1119        Chain
1120    };
1121    return DAG.getMergeValues(MergedValues, 2, DL);
1122  }
1123
1124  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1125    return SDValue();
1126  }
1127
1128  // Lowering for indirect addressing
1129  const MachineFunction &MF = DAG.getMachineFunction();
1130  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1131                                         getTargetMachine().getFrameLowering());
1132  unsigned StackWidth = TFL->getStackWidth(MF);
1133
1134  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1135
1136  if (VT.isVector()) {
1137    unsigned NumElemVT = VT.getVectorNumElements();
1138    EVT ElemVT = VT.getVectorElementType();
1139    SDValue Loads[4];
1140
1141    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1142                                      "vector width in load");
1143
1144    for (unsigned i = 0; i < NumElemVT; ++i) {
1145      unsigned Channel, PtrIncr;
1146      getStackAddress(StackWidth, i, Channel, PtrIncr);
1147      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1148                        DAG.getConstant(PtrIncr, MVT::i32));
1149      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1150                             Chain, Ptr,
1151                             DAG.getTargetConstant(Channel, MVT::i32),
1152                             Op.getOperand(2));
1153    }
1154    for (unsigned i = NumElemVT; i < 4; ++i) {
1155      Loads[i] = DAG.getUNDEF(ElemVT);
1156    }
1157    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1158    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1159  } else {
1160    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1161                              Chain, Ptr,
1162                              DAG.getTargetConstant(0, MVT::i32), // Channel
1163                              Op.getOperand(2));
1164  }
1165
1166  SDValue Ops[2];
1167  Ops[0] = LoweredLoad;
1168  Ops[1] = Chain;
1169
1170  return DAG.getMergeValues(Ops, 2, DL);
1171}
1172
1173/// XXX Only kernel functions are supported, so we can assume for now that
1174/// every function is a kernel function, but in the future we should use
1175/// separate calling conventions for kernel and non-kernel functions.
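/// Arguments are not passed in registers: each formal argument is read back as
/// a zero-extending load from the PARAM_I_ADDRESS space, starting at byte
/// offset 36 and advancing by the byte size of each argument.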
1176SDValue R600TargetLowering::LowerFormalArguments(
1177                                      SDValue Chain,
1178                                      CallingConv::ID CallConv,
1179                                      bool isVarArg,
1180                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1181                                      SDLoc DL, SelectionDAG &DAG,
1182                                      SmallVectorImpl<SDValue> &InVals) const {
1183  unsigned ParamOffsetBytes = 36;
1184  Function::const_arg_iterator FuncArg =
1185                            DAG.getMachineFunction().getFunction()->arg_begin();
1186  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
1187    EVT VT = Ins[i].VT;
1188    Type *ArgType = FuncArg->getType();
1189    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
1190                             32 : ArgType->getPrimitiveSizeInBits();
1191    unsigned ArgBytes = ArgSizeInBits >> 3;
1192    EVT ArgVT;
1193    if (ArgSizeInBits < VT.getSizeInBits()) {
1194      assert(!ArgType->isFloatTy() &&
1195             "Extending floating point arguments not supported yet");
1196      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
1197    } else {
1198      ArgVT = VT;
1199    }
1200    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1201                                                    AMDGPUAS::PARAM_I_ADDRESS);
1202    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
1203                                DAG.getConstant(ParamOffsetBytes, MVT::i32),
1204                                       MachinePointerInfo(UndefValue::get(PtrTy)),
1205                                       ArgVT, false, false, ArgBytes);
1206    InVals.push_back(Arg);
1207    ParamOffsetBytes += ArgBytes;
1208  }
1209  return Chain;
1210}
1211
1212EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1213   if (!VT.isVector()) return MVT::i32;
1214   return VT.changeVectorElementTypeToInteger();
1215}
1216
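/// Fold constant and duplicate BUILD_VECTOR operands into swizzle selects:
/// a 0.0 element is remapped to SEL_0 (4), a 1.0 element to SEL_1 (5), and an
/// element equal to an earlier lane reuses that lane's index.  RemapSwizzle
/// records the old -> new swizzle values.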
1217SDValue CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
1218                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
1219  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1220  assert(RemapSwizzle.empty());
1221  SDValue NewBldVec[4] = {
1222      VectorEntry.getOperand(0),
1223      VectorEntry.getOperand(1),
1224      VectorEntry.getOperand(2),
1225      VectorEntry.getOperand(3)
1226  };
1227
1228  for (unsigned i = 0; i < 4; i++) {
1229    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1230      if (C->isZero()) {
1231        RemapSwizzle[i] = 4; // SEL_0
1232        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1233      } else if (C->isExactlyValue(1.0)) {
1234        RemapSwizzle[i] = 5; // SEL_1
1235        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1236      }
1237    }
1238
1239    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1240      continue;
1241    for (unsigned j = 0; j < i; j++) {
1242      if (NewBldVec[i] == NewBldVec[j]) {
1243        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1244        RemapSwizzle[i] = j;
1245        break;
1246      }
1247    }
1248  }
1249
1250  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1251      VectorEntry.getValueType(), NewBldVec, 4);
1252}
1253
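/// Move BUILD_VECTOR operands that come from EXTRACT_VECTOR_ELT back into the
/// lane they were extracted from, when that lane has not already been claimed,
/// recording the lane swaps in RemapSwizzle.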
1254SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1255                         DenseMap<unsigned, unsigned> &RemapSwizzle) {
1256  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1257  assert(RemapSwizzle.empty());
1258  SDValue NewBldVec[4] = {
1259      VectorEntry.getOperand(0),
1260      VectorEntry.getOperand(1),
1261      VectorEntry.getOperand(2),
1262      VectorEntry.getOperand(3)
1263  };
1264  bool isUnmovable[4] = { false, false, false, false };
1265
1266  for (unsigned i = 0; i < 4; i++) {
1267    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1268      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1269          ->getZExtValue();
1270      if (!isUnmovable[Idx]) {
1271        // Swap i and Idx
1272        std::swap(NewBldVec[Idx], NewBldVec[i]);
1273        RemapSwizzle[Idx] = i;
1274        RemapSwizzle[i] = Idx;
1275      }
1276      isUnmovable[Idx] = true;
1277    }
1278  }
1279
1280  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1281      VectorEntry.getValueType(), NewBldVec, 4);
1282}
1283
1284
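/// Canonicalize a BUILD_VECTOR used as an export or texture source: run
/// CompactSwizzlableVector and ReorganizeVector over it, and rewrite the four
/// swizzle operands Swz[0..3] through the resulting old -> new remaps.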
1285SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1286SDValue Swz[4], SelectionDAG &DAG) const {
1287  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1288  // Old -> New swizzle values
1289  DenseMap<unsigned, unsigned> SwizzleRemap;
1290
1291  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1292  for (unsigned i = 0; i < 4; i++) {
1293    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1294    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1295      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1296  }
1297
1298  SwizzleRemap.clear();
1299  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1300  for (unsigned i = 0; i < 4; i++) {
1301    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1302    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1303      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1304  }
1305
1306  return BuildVector;
1307}
1308
1309
1310//===----------------------------------------------------------------------===//
1311// Custom DAG Optimizations
1312//===----------------------------------------------------------------------===//
1313
1314SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1315                                              DAGCombinerInfo &DCI) const {
1316  SelectionDAG &DAG = DCI.DAG;
1317
1318  switch (N->getOpcode()) {
1319  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1320  case ISD::FP_ROUND: {
1321      SDValue Arg = N->getOperand(0);
1322      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1323        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1324                           Arg.getOperand(0));
1325      }
1326      break;
1327    }
1328
1329  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0, cc))) ->
1330  // (i32 select_cc f32, f32, -1, 0 cc)
1331  //
1332  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1333  // this to one of the SET*_DX10 instructions.
1334  case ISD::FP_TO_SINT: {
1335    SDValue FNeg = N->getOperand(0);
1336    if (FNeg.getOpcode() != ISD::FNEG) {
1337      return SDValue();
1338    }
1339    SDValue SelectCC = FNeg.getOperand(0);
1340    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1341        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1342        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1343        !isHWTrueValue(SelectCC.getOperand(2)) ||
1344        !isHWFalseValue(SelectCC.getOperand(3))) {
1345      return SDValue();
1346    }
1347
1348    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1349                           SelectCC.getOperand(0), // LHS
1350                           SelectCC.getOperand(1), // RHS
1351                           DAG.getConstant(-1, MVT::i32), // True
1352                           DAG.getConstant(0, MVT::i32),  // False
1353                           SelectCC.getOperand(4)); // CC
1354
1355    break;
1356  }
1357  // Extract_vec (Build_vector) generated by custom lowering
1358  // also needs to be combined here.
1359  case ISD::EXTRACT_VECTOR_ELT: {
1360    SDValue Arg = N->getOperand(0);
1361    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1362      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1363        unsigned Element = Const->getZExtValue();
1364        return Arg->getOperand(Element);
1365      }
1366    }
1367    if (Arg.getOpcode() == ISD::BITCAST &&
1368        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1369      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1370        unsigned Element = Const->getZExtValue();
1371        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1372            Arg->getOperand(0).getOperand(Element));
1373      }
1374    }
1375  }
1376
1377  case ISD::SELECT_CC: {
1378    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1379    //      selectcc x, y, a, b, inv(cc)
1380    //
1381    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1382    //      selectcc x, y, a, b, cc
1383    SDValue LHS = N->getOperand(0);
1384    if (LHS.getOpcode() != ISD::SELECT_CC) {
1385      return SDValue();
1386    }
1387
1388    SDValue RHS = N->getOperand(1);
1389    SDValue True = N->getOperand(2);
1390    SDValue False = N->getOperand(3);
1391    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1392
1393    if (LHS.getOperand(2).getNode() != True.getNode() ||
1394        LHS.getOperand(3).getNode() != False.getNode() ||
1395        RHS.getNode() != False.getNode()) {
1396      return SDValue();
1397    }
1398
1399    switch (NCC) {
1400    default: return SDValue();
1401    case ISD::SETNE: return LHS;
1402    case ISD::SETEQ: {
1403      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1404      LHSCC = ISD::getSetCCInverse(LHSCC,
1405                                  LHS.getOperand(0).getValueType().isInteger());
1406      return DAG.getSelectCC(SDLoc(N),
1407                             LHS.getOperand(0),
1408                             LHS.getOperand(1),
1409                             LHS.getOperand(2),
1410                             LHS.getOperand(3),
1411                             LHSCC);
1412    }
1413    }
1414  }
1415  case AMDGPUISD::EXPORT: {
1416    SDValue Arg = N->getOperand(1);
1417    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1418      break;
1419
1420    SDValue NewArgs[8] = {
1421      N->getOperand(0), // Chain
1422      SDValue(),
1423      N->getOperand(2), // ArrayBase
1424      N->getOperand(3), // Type
1425      N->getOperand(4), // SWZ_X
1426      N->getOperand(5), // SWZ_Y
1427      N->getOperand(6), // SWZ_Z
1428      N->getOperand(7) // SWZ_W
1429    };
1430    SDLoc DL(N);
1431    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1432    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1433  }
1434  case AMDGPUISD::TEXTURE_FETCH: {
1435    SDValue Arg = N->getOperand(1);
1436    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1437      break;
1438
1439    SDValue NewArgs[19] = {
1440      N->getOperand(0),
1441      N->getOperand(1),
1442      N->getOperand(2),
1443      N->getOperand(3),
1444      N->getOperand(4),
1445      N->getOperand(5),
1446      N->getOperand(6),
1447      N->getOperand(7),
1448      N->getOperand(8),
1449      N->getOperand(9),
1450      N->getOperand(10),
1451      N->getOperand(11),
1452      N->getOperand(12),
1453      N->getOperand(13),
1454      N->getOperand(14),
1455      N->getOperand(15),
1456      N->getOperand(16),
1457      N->getOperand(17),
1458      N->getOperand(18),
1459    };
1460    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1461    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1462        NewArgs, 19);
1463  }
1464  }
1465  return SDValue();
1466}
1467