R600ISelLowering.cpp revision 0962e147a439785279c3665379189017e980e0cc
1//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for R600
12//
13//===----------------------------------------------------------------------===//
14
15#include "R600ISelLowering.h"
16#include "R600Defines.h"
17#include "R600InstrInfo.h"
18#include "R600MachineFunctionInfo.h"
19#include "llvm/CodeGen/MachineFrameInfo.h"
20#include "llvm/CodeGen/MachineInstrBuilder.h"
21#include "llvm/CodeGen/MachineRegisterInfo.h"
22#include "llvm/CodeGen/SelectionDAG.h"
23#include "llvm/IR/Argument.h"
24#include "llvm/IR/Function.h"
25
26using namespace llvm;
27
28R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
29    AMDGPUTargetLowering(TM),
30    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
31  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
32  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
33  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
34  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
35  computeRegisterProperties();
36
37  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
38  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
39  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
40  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
41
42  setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
43  setOperationAction(ISD::AND,  MVT::v4i32, Expand);
44  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
45  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
46  setOperationAction(ISD::MUL,  MVT::v2i32, Expand);
47  setOperationAction(ISD::MUL,  MVT::v4i32, Expand);
48  setOperationAction(ISD::OR, MVT::v4i32, Expand);
49  setOperationAction(ISD::OR, MVT::v2i32, Expand);
50  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
51  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
52  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
53  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
54  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
55  setOperationAction(ISD::SRA, MVT::v4i32, Expand);
56  setOperationAction(ISD::SRA, MVT::v2i32, Expand);
57  setOperationAction(ISD::SUB, MVT::v4i32, Expand);
58  setOperationAction(ISD::SUB, MVT::v2i32, Expand);
59  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
60  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
61  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
62  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
63  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
64  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
65
66  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
67  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
68
69  setOperationAction(ISD::FSUB, MVT::f32, Expand);
70
71  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
72  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
73  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
74
75  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
76  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
77
78  setOperationAction(ISD::SETCC, MVT::i32, Expand);
79  setOperationAction(ISD::SETCC, MVT::f32, Expand);
80  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
81
82  setOperationAction(ISD::SELECT, MVT::i32, Custom);
83  setOperationAction(ISD::SELECT, MVT::f32, Custom);
84
85  setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
86  setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
87
88  // Legalize loads and stores to the private address space.
89  setOperationAction(ISD::LOAD, MVT::i32, Custom);
90  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
91  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
92  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
93  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
94  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
95  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
96  setOperationAction(ISD::STORE, MVT::i8, Custom);
97  setOperationAction(ISD::STORE, MVT::i32, Custom);
98  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
99  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
100
101  setOperationAction(ISD::LOAD, MVT::i32, Custom);
102  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
103  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
104
105  setTargetDAGCombine(ISD::FP_ROUND);
106  setTargetDAGCombine(ISD::FP_TO_SINT);
107  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
108  setTargetDAGCombine(ISD::SELECT_CC);
109
110  setBooleanContents(ZeroOrNegativeOneBooleanContent);
111  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
112  setSchedulingPreference(Sched::VLIW);
113}
114
115MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
116    MachineInstr * MI, MachineBasicBlock * BB) const {
117  MachineFunction * MF = BB->getParent();
118  MachineRegisterInfo &MRI = MF->getRegInfo();
119  MachineBasicBlock::iterator I = *MI;
120
121  switch (MI->getOpcode()) {
122  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
123  case AMDGPU::CLAMP_R600: {
124    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
125                                                   AMDGPU::MOV,
126                                                   MI->getOperand(0).getReg(),
127                                                   MI->getOperand(1).getReg());
128    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
129    break;
130  }
131
132  case AMDGPU::FABS_R600: {
133    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
134                                                    AMDGPU::MOV,
135                                                    MI->getOperand(0).getReg(),
136                                                    MI->getOperand(1).getReg());
137    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
138    break;
139  }
140
141  case AMDGPU::FNEG_R600: {
142    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
143                                                    AMDGPU::MOV,
144                                                    MI->getOperand(0).getReg(),
145                                                    MI->getOperand(1).getReg());
146    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
147    break;
148  }
149
150  case AMDGPU::MASK_WRITE: {
151    unsigned maskedRegister = MI->getOperand(0).getReg();
152    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
153    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
154    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
155    break;
156  }
157
158  case AMDGPU::MOV_IMM_F32:
159    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
160                     MI->getOperand(1).getFPImm()->getValueAPF()
161                         .bitcastToAPInt().getZExtValue());
162    break;
163  case AMDGPU::MOV_IMM_I32:
164    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
165                     MI->getOperand(1).getImm());
166    break;
167  case AMDGPU::CONST_COPY: {
168    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
169        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
170    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
171        MI->getOperand(1).getImm());
172    break;
173  }
174
175  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
176  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
177    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
178
179    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
180            .addOperand(MI->getOperand(0))
181            .addOperand(MI->getOperand(1))
182            .addImm(EOP); // Set End of program bit
183    break;
184  }
185
186  case AMDGPU::TXD: {
187    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
188    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
189    MachineOperand &RID = MI->getOperand(4);
190    MachineOperand &SID = MI->getOperand(5);
191    unsigned TextureId = MI->getOperand(6).getImm();
192    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
193    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
194
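    // SrcX..SrcW select which source component feeds each texture coordinate,
    // and CTX..CTW carry the per-coordinate type flags; the switch below
    // remaps them for Rect (unnormalized coordinates), Shadow and Array
    // targets.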
195    switch (TextureId) {
196    case 5: // Rect
197      CTX = CTY = 0;
198      break;
199    case 6: // Shadow1D
200      SrcW = SrcZ;
201      break;
202    case 7: // Shadow2D
203      SrcW = SrcZ;
204      break;
205    case 8: // ShadowRect
206      CTX = CTY = 0;
207      SrcW = SrcZ;
208      break;
209    case 9: // 1DArray
210      SrcZ = SrcY;
211      CTZ = 0;
212      break;
213    case 10: // 2DArray
214      CTZ = 0;
215      break;
216    case 11: // Shadow1DArray
217      SrcZ = SrcY;
218      CTZ = 0;
219      break;
220    case 12: // Shadow2DArray
221      CTZ = 0;
222      break;
223    }
224    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
225            .addOperand(MI->getOperand(3))
226            .addImm(SrcX)
227            .addImm(SrcY)
228            .addImm(SrcZ)
229            .addImm(SrcW)
230            .addImm(0)
231            .addImm(0)
232            .addImm(0)
233            .addImm(0)
234            .addImm(1)
235            .addImm(2)
236            .addImm(3)
237            .addOperand(RID)
238            .addOperand(SID)
239            .addImm(CTX)
240            .addImm(CTY)
241            .addImm(CTZ)
242            .addImm(CTW);
243    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
244            .addOperand(MI->getOperand(2))
245            .addImm(SrcX)
246            .addImm(SrcY)
247            .addImm(SrcZ)
248            .addImm(SrcW)
249            .addImm(0)
250            .addImm(0)
251            .addImm(0)
252            .addImm(0)
253            .addImm(1)
254            .addImm(2)
255            .addImm(3)
256            .addOperand(RID)
257            .addOperand(SID)
258            .addImm(CTX)
259            .addImm(CTY)
260            .addImm(CTZ)
261            .addImm(CTW);
262    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
263            .addOperand(MI->getOperand(0))
264            .addOperand(MI->getOperand(1))
265            .addImm(SrcX)
266            .addImm(SrcY)
267            .addImm(SrcZ)
268            .addImm(SrcW)
269            .addImm(0)
270            .addImm(0)
271            .addImm(0)
272            .addImm(0)
273            .addImm(1)
274            .addImm(2)
275            .addImm(3)
276            .addOperand(RID)
277            .addOperand(SID)
278            .addImm(CTX)
279            .addImm(CTY)
280            .addImm(CTZ)
281            .addImm(CTW)
282            .addReg(T0, RegState::Implicit)
283            .addReg(T1, RegState::Implicit);
284    break;
285  }
286
287  case AMDGPU::TXD_SHADOW: {
288    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
289    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
290    MachineOperand &RID = MI->getOperand(4);
291    MachineOperand &SID = MI->getOperand(5);
292    unsigned TextureId = MI->getOperand(6).getImm();
293    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
294    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
295
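    // Same source-swizzle / coordinate-type remapping as in the TXD case
    // above.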
296    switch (TextureId) {
297    case 5: // Rect
298      CTX = CTY = 0;
299      break;
300    case 6: // Shadow1D
301      SrcW = SrcZ;
302      break;
303    case 7: // Shadow2D
304      SrcW = SrcZ;
305      break;
306    case 8: // ShadowRect
307      CTX = CTY = 0;
308      SrcW = SrcZ;
309      break;
310    case 9: // 1DArray
311      SrcZ = SrcY;
312      CTZ = 0;
313      break;
314    case 10: // 2DArray
315      CTZ = 0;
316      break;
317    case 11: // Shadow1DArray
318      SrcZ = SrcY;
319      CTZ = 0;
320      break;
321    case 12: // Shadow2DArray
322      CTZ = 0;
323      break;
324    }
325
326    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
327            .addOperand(MI->getOperand(3))
328            .addImm(SrcX)
329            .addImm(SrcY)
330            .addImm(SrcZ)
331            .addImm(SrcW)
332            .addImm(0)
333            .addImm(0)
334            .addImm(0)
335            .addImm(0)
336            .addImm(1)
337            .addImm(2)
338            .addImm(3)
339            .addOperand(RID)
340            .addOperand(SID)
341            .addImm(CTX)
342            .addImm(CTY)
343            .addImm(CTZ)
344            .addImm(CTW);
345    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
346            .addOperand(MI->getOperand(2))
347            .addImm(SrcX)
348            .addImm(SrcY)
349            .addImm(SrcZ)
350            .addImm(SrcW)
351            .addImm(0)
352            .addImm(0)
353            .addImm(0)
354            .addImm(0)
355            .addImm(1)
356            .addImm(2)
357            .addImm(3)
358            .addOperand(RID)
359            .addOperand(SID)
360            .addImm(CTX)
361            .addImm(CTY)
362            .addImm(CTZ)
363            .addImm(CTW);
364    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
365            .addOperand(MI->getOperand(0))
366            .addOperand(MI->getOperand(1))
367            .addImm(SrcX)
368            .addImm(SrcY)
369            .addImm(SrcZ)
370            .addImm(SrcW)
371            .addImm(0)
372            .addImm(0)
373            .addImm(0)
374            .addImm(0)
375            .addImm(1)
376            .addImm(2)
377            .addImm(3)
378            .addOperand(RID)
379            .addOperand(SID)
380            .addImm(CTX)
381            .addImm(CTY)
382            .addImm(CTZ)
383            .addImm(CTW)
384            .addReg(T0, RegState::Implicit)
385            .addReg(T1, RegState::Implicit);
386    break;
387  }
388
389  case AMDGPU::BRANCH:
390      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
391              .addOperand(MI->getOperand(0));
392      break;
393
394  case AMDGPU::BRANCH_COND_f32: {
395    MachineInstr *NewMI =
396      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
397              AMDGPU::PREDICATE_BIT)
398              .addOperand(MI->getOperand(1))
399              .addImm(OPCODE_IS_NOT_ZERO)
400              .addImm(0); // Flags
401    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
402    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
403            .addOperand(MI->getOperand(0))
404            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
405    break;
406  }
407
408  case AMDGPU::BRANCH_COND_i32: {
409    MachineInstr *NewMI =
410      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
411            AMDGPU::PREDICATE_BIT)
412            .addOperand(MI->getOperand(1))
413            .addImm(OPCODE_IS_NOT_ZERO_INT)
414            .addImm(0); // Flags
415    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
416    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
417            .addOperand(MI->getOperand(0))
418            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
419    break;
420  }
421
422  case AMDGPU::EG_ExportSwz:
423  case AMDGPU::R600_ExportSwz: {
424    // The instruction is left unmodified if it's not the last one of its type.
425    bool isLastInstructionOfItsType = true;
426    unsigned InstExportType = MI->getOperand(1).getImm();
427    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
428         EndBlock = BB->end(); NextExportInst != EndBlock;
429         NextExportInst = llvm::next(NextExportInst)) {
430      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
431          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
432        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
433            .getImm();
434        if (CurrentInstExportType == InstExportType) {
435          isLastInstructionOfItsType = false;
436          break;
437        }
438      }
439    }
440    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN);
441    if (!EOP && !isLastInstructionOfItsType)
442      return BB;
443    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
444    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
445            .addOperand(MI->getOperand(0))
446            .addOperand(MI->getOperand(1))
447            .addOperand(MI->getOperand(2))
448            .addOperand(MI->getOperand(3))
449            .addOperand(MI->getOperand(4))
450            .addOperand(MI->getOperand(5))
451            .addOperand(MI->getOperand(6))
452            .addImm(CfInst)
453            .addImm(EOP);
454    break;
455  }
456  case AMDGPU::RETURN: {
457    // RETURN instructions must have the live-out registers as implicit uses,
458    // otherwise they appear dead.
459    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
460    MachineInstrBuilder MIB(*MF, MI);
461    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
462      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
463    return BB;
464  }
465  }
466
467  MI->eraseFromParent();
468  return BB;
469}
470
471//===----------------------------------------------------------------------===//
472// Custom DAG Lowering Operations
473//===----------------------------------------------------------------------===//
474
475SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
476  switch (Op.getOpcode()) {
477  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
478  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
479  case ISD::SELECT: return LowerSELECT(Op, DAG);
480  case ISD::STORE: return LowerSTORE(Op, DAG);
481  case ISD::LOAD: return LowerLOAD(Op, DAG);
482  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
483  case ISD::INTRINSIC_VOID: {
484    SDValue Chain = Op.getOperand(0);
485    unsigned IntrinsicID =
486                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
487    switch (IntrinsicID) {
488    case AMDGPUIntrinsic::AMDGPU_store_output: {
489      MachineFunction &MF = DAG.getMachineFunction();
490      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
491      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
492      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
493      MFI->LiveOuts.push_back(Reg);
494      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
495    }
496    case AMDGPUIntrinsic::R600_store_swizzle: {
497      const SDValue Args[8] = {
498        Chain,
499        Op.getOperand(2), // Export Value
500        Op.getOperand(3), // ArrayBase
501        Op.getOperand(4), // Type
502        DAG.getConstant(0, MVT::i32), // SWZ_X
503        DAG.getConstant(1, MVT::i32), // SWZ_Y
504        DAG.getConstant(2, MVT::i32), // SWZ_Z
505        DAG.getConstant(3, MVT::i32) // SWZ_W
506      };
507      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
508          Args, 8);
509    }
510
511    // default for switch(IntrinsicID)
512    default: break;
513    }
514    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
515    break;
516  }
517  case ISD::INTRINSIC_WO_CHAIN: {
518    unsigned IntrinsicID =
519                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
520    EVT VT = Op.getValueType();
521    SDLoc DL(Op);
522    switch(IntrinsicID) {
523    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
524    case AMDGPUIntrinsic::R600_load_input: {
525      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
526      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
527      MachineFunction &MF = DAG.getMachineFunction();
528      MachineRegisterInfo &MRI = MF.getRegInfo();
529      MRI.addLiveIn(Reg);
530      return DAG.getCopyFromReg(DAG.getEntryNode(),
531          SDLoc(DAG.getEntryNode()), Reg, VT);
532    }
533
534    case AMDGPUIntrinsic::R600_interp_input: {
535      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
536      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
537      MachineSDNode *interp;
538      if (ijb < 0) {
539        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
540            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
541        return DAG.getTargetExtractSubreg(
542            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
543            DL, MVT::f32, SDValue(interp, 0));
544      }
545
546      MachineFunction &MF = DAG.getMachineFunction();
547      MachineRegisterInfo &MRI = MF.getRegInfo();
548      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
549      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
550      MRI.addLiveIn(RegisterI);
551      MRI.addLiveIn(RegisterJ);
552      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
553          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
554      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
555          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
556
557      if (slot % 4 < 2)
558        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
559            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
560            RegisterJNode, RegisterINode);
561      else
562        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
563            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
564            RegisterJNode, RegisterINode);
565      return SDValue(interp, slot % 2);
566    }
567    case AMDGPUIntrinsic::R600_tex:
568    case AMDGPUIntrinsic::R600_texc:
569    case AMDGPUIntrinsic::R600_txl:
570    case AMDGPUIntrinsic::R600_txlc:
571    case AMDGPUIntrinsic::R600_txb:
572    case AMDGPUIntrinsic::R600_txbc:
573    case AMDGPUIntrinsic::R600_txf:
574    case AMDGPUIntrinsic::R600_txq:
575    case AMDGPUIntrinsic::R600_ddx:
576    case AMDGPUIntrinsic::R600_ddy: {
577      unsigned TextureOp;
578      switch (IntrinsicID) {
579      case AMDGPUIntrinsic::R600_tex:
580        TextureOp = 0;
581        break;
582      case AMDGPUIntrinsic::R600_texc:
583        TextureOp = 1;
584        break;
585      case AMDGPUIntrinsic::R600_txl:
586        TextureOp = 2;
587        break;
588      case AMDGPUIntrinsic::R600_txlc:
589        TextureOp = 3;
590        break;
591      case AMDGPUIntrinsic::R600_txb:
592        TextureOp = 4;
593        break;
594      case AMDGPUIntrinsic::R600_txbc:
595        TextureOp = 5;
596        break;
597      case AMDGPUIntrinsic::R600_txf:
598        TextureOp = 6;
599        break;
600      case AMDGPUIntrinsic::R600_txq:
601        TextureOp = 7;
602        break;
603      case AMDGPUIntrinsic::R600_ddx:
604        TextureOp = 8;
605        break;
606      case AMDGPUIntrinsic::R600_ddy:
607        TextureOp = 9;
608        break;
609      default:
610        llvm_unreachable("Unknown Texture Operation");
611      }
612
613      SDValue TexArgs[19] = {
614        DAG.getConstant(TextureOp, MVT::i32),
615        Op.getOperand(1),
616        DAG.getConstant(0, MVT::i32),
617        DAG.getConstant(1, MVT::i32),
618        DAG.getConstant(2, MVT::i32),
619        DAG.getConstant(3, MVT::i32),
620        Op.getOperand(2),
621        Op.getOperand(3),
622        Op.getOperand(4),
623        DAG.getConstant(0, MVT::i32),
624        DAG.getConstant(1, MVT::i32),
625        DAG.getConstant(2, MVT::i32),
626        DAG.getConstant(3, MVT::i32),
627        Op.getOperand(5),
628        Op.getOperand(6),
629        Op.getOperand(7),
630        Op.getOperand(8),
631        Op.getOperand(9),
632        Op.getOperand(10)
633      };
634      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
635    }
636    case AMDGPUIntrinsic::AMDGPU_dp4: {
637      SDValue Args[8] = {
638      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
639          DAG.getConstant(0, MVT::i32)),
640      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
641          DAG.getConstant(0, MVT::i32)),
642      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
643          DAG.getConstant(1, MVT::i32)),
644      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
645          DAG.getConstant(1, MVT::i32)),
646      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
647          DAG.getConstant(2, MVT::i32)),
648      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
649          DAG.getConstant(2, MVT::i32)),
650      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
651          DAG.getConstant(3, MVT::i32)),
652      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
653          DAG.getConstant(3, MVT::i32))
654      };
655      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
656    }
657
658    case Intrinsic::r600_read_ngroups_x:
659      return LowerImplicitParameter(DAG, VT, DL, 0);
660    case Intrinsic::r600_read_ngroups_y:
661      return LowerImplicitParameter(DAG, VT, DL, 1);
662    case Intrinsic::r600_read_ngroups_z:
663      return LowerImplicitParameter(DAG, VT, DL, 2);
664    case Intrinsic::r600_read_global_size_x:
665      return LowerImplicitParameter(DAG, VT, DL, 3);
666    case Intrinsic::r600_read_global_size_y:
667      return LowerImplicitParameter(DAG, VT, DL, 4);
668    case Intrinsic::r600_read_global_size_z:
669      return LowerImplicitParameter(DAG, VT, DL, 5);
670    case Intrinsic::r600_read_local_size_x:
671      return LowerImplicitParameter(DAG, VT, DL, 6);
672    case Intrinsic::r600_read_local_size_y:
673      return LowerImplicitParameter(DAG, VT, DL, 7);
674    case Intrinsic::r600_read_local_size_z:
675      return LowerImplicitParameter(DAG, VT, DL, 8);
676
677    case Intrinsic::r600_read_tgid_x:
678      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
679                                  AMDGPU::T1_X, VT);
680    case Intrinsic::r600_read_tgid_y:
681      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
682                                  AMDGPU::T1_Y, VT);
683    case Intrinsic::r600_read_tgid_z:
684      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
685                                  AMDGPU::T1_Z, VT);
686    case Intrinsic::r600_read_tidig_x:
687      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
688                                  AMDGPU::T0_X, VT);
689    case Intrinsic::r600_read_tidig_y:
690      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
691                                  AMDGPU::T0_Y, VT);
692    case Intrinsic::r600_read_tidig_z:
693      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
694                                  AMDGPU::T0_Z, VT);
695    }
696    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
697    break;
698  }
699  } // end switch(Op.getOpcode())
700  return SDValue();
701}
702
703void R600TargetLowering::ReplaceNodeResults(SDNode *N,
704                                            SmallVectorImpl<SDValue> &Results,
705                                            SelectionDAG &DAG) const {
706  switch (N->getOpcode()) {
707  default: return;
708  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
709    return;
710  case ISD::LOAD: {
711    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
712    Results.push_back(SDValue(Node, 0));
713    Results.push_back(SDValue(Node, 1));
714    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
715    // function
716    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
717    return;
718  }
719  case ISD::STORE:
720    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
721    Results.push_back(SDValue(Node, 0));
722    return;
723  }
724}
725
726SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
727  return DAG.getNode(
728      ISD::SETCC,
729      SDLoc(Op),
730      MVT::i1,
731      Op, DAG.getConstantFP(0.0f, MVT::f32),
732      DAG.getCondCode(ISD::SETNE)
733      );
734}
735
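/// The r600_read_ngroups_*, r600_read_global_size_* and r600_read_local_size_*
/// intrinsics handled above are lowered through this helper: each implicit
/// kernel parameter lives at a fixed dword offset in the PARAM_I_ADDRESS
/// buffer and is fetched with an ordinary load.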
736SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
737                                                   SDLoc DL,
738                                                   unsigned DwordOffset) const {
739  unsigned ByteOffset = DwordOffset * 4;
740  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
741                                      AMDGPUAS::PARAM_I_ADDRESS);
742
743  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
744  assert(isInt<16>(ByteOffset));
745
746  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
747                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
748                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
749                     false, false, false, 0);
750}
751
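/// Frame indices into the private stack are resolved at compile time: the
/// frame-index offset from the target frame lowering is scaled by
/// 4 * StackWidth bytes and returned as a plain i32 constant.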
752SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
753
754  MachineFunction &MF = DAG.getMachineFunction();
755  const AMDGPUFrameLowering *TFL =
756   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
757
758  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
759  assert(FIN);
760
761  unsigned FrameIndex = FIN->getIndex();
762  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
763  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
764}
765
766bool R600TargetLowering::isZero(SDValue Op) const {
767  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
768    return Cst->isNullValue();
769  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
770    return CstFP->isZero();
771  } else {
772    return false;
773  }
774}
775
776SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
777  SDLoc DL(Op);
778  EVT VT = Op.getValueType();
779
780  SDValue LHS = Op.getOperand(0);
781  SDValue RHS = Op.getOperand(1);
782  SDValue True = Op.getOperand(2);
783  SDValue False = Op.getOperand(3);
784  SDValue CC = Op.getOperand(4);
785  SDValue Temp;
786
787  // LHS and RHS are guaranteed to be the same value type
788  EVT CompareVT = LHS.getValueType();
789
790  // Check if we can lower this to a native operation.
791
792  // Try to lower to a SET* instruction:
793  //
794  // SET* can match the following patterns:
795  //
796  // select_cc f32, f32, -1,  0, cc_any
797  // select_cc f32, f32, 1.0f, 0.0f, cc_any
798  // select_cc i32, i32, -1,  0, cc_any
799  //
800
801  // Move hardware True/False values to the correct operand.
802  if (isHWTrueValue(False) && isHWFalseValue(True)) {
803    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
804    std::swap(False, True);
805    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
806  }
807
808  if (isHWTrueValue(True) && isHWFalseValue(False) &&
809      (CompareVT == VT || VT == MVT::i32)) {
810    // This can be matched by a SET* instruction.
811    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
812  }
813
814  // Try to lower to a CND* instruction:
815  //
816  // CND* can match the following patterns:
817  //
818  // select_cc f32, 0.0, f32, f32, cc_any
819  // select_cc f32, 0.0, i32, i32, cc_any
820  // select_cc i32, 0,   f32, f32, cc_any
821  // select_cc i32, 0,   i32, i32, cc_any
822  //
823  if (isZero(LHS) || isZero(RHS)) {
824    SDValue Cond = (isZero(LHS) ? RHS : LHS);
825    SDValue Zero = (isZero(LHS) ? LHS : RHS);
826    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
827    if (CompareVT != VT) {
828      // Bitcast True / False to the correct types.  This will end up being
829      // a nop, but it allows us to define only a single pattern in the
830      // .TD files for each CND* instruction rather than having to have
831      // one pattern for integer True/False and one for fp True/False
832      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
833      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
834    }
835    if (isZero(LHS)) {
836      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
837    }
838
839    switch (CCOpcode) {
840    case ISD::SETONE:
841    case ISD::SETUNE:
842    case ISD::SETNE:
843    case ISD::SETULE:
844    case ISD::SETULT:
845    case ISD::SETOLE:
846    case ISD::SETOLT:
847    case ISD::SETLE:
848    case ISD::SETLT:
849      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
850      Temp = True;
851      True = False;
852      False = Temp;
853      break;
854    default:
855      break;
856    }
857    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
858        Cond, Zero,
859        True, False,
860        DAG.getCondCode(CCOpcode));
861    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
862  }
863
864
865  // Possible Min/Max pattern
866  SDValue MinMax = LowerMinMax(Op, DAG);
867  if (MinMax.getNode()) {
868    return MinMax;
869  }
870
871  // If we make it this far, it means we have no native instructions to handle
872  // this SELECT_CC, so we must lower it.
873  SDValue HWTrue, HWFalse;
874
875  if (CompareVT == MVT::f32) {
876    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
877    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
878  } else if (CompareVT == MVT::i32) {
879    HWTrue = DAG.getConstant(-1, CompareVT);
880    HWFalse = DAG.getConstant(0, CompareVT);
881  }
882  else {
883    assert(!"Unhandled value type in LowerSELECT_CC");
884  }
885
886  // Lower this unsupported SELECT_CC into a combination of two supported
887  // SELECT_CC operations.
888  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
889
890  return DAG.getNode(ISD::SELECT_CC, DL, VT,
891      Cond, HWFalse,
892      True, False,
893      DAG.getCondCode(ISD::SETNE));
894}
895
896SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
897  return DAG.getNode(ISD::SELECT_CC,
898      SDLoc(Op),
899      Op.getValueType(),
900      Op.getOperand(0),
901      DAG.getConstant(0, MVT::i32),
902      Op.getOperand(1),
903      Op.getOperand(2),
904      DAG.getCondCode(ISD::SETNE));
905}
906
907/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
908/// convert these pointers to a register index.  Each register holds
909/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
910/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
911/// for indirect addressing.
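/// For example, with a StackWidth of 2 each register row covers 8 bytes
/// (two 32-bit channels), so byte offset 24 becomes register index
/// 24 >> 3 == 3; the SRLPad values below encode exactly that shift.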
912SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
913                                               unsigned StackWidth,
914                                               SelectionDAG &DAG) const {
915  unsigned SRLPad;
916  switch(StackWidth) {
917  case 1:
918    SRLPad = 2;
919    break;
920  case 2:
921    SRLPad = 3;
922    break;
923  case 4:
924    SRLPad = 4;
925    break;
926  default: llvm_unreachable("Invalid stack width");
927  }
928
929  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
930                     DAG.getConstant(SRLPad, MVT::i32));
931}
932
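/// Given \p StackWidth channels per register row, map the flat element index
/// \p ElemIdx of an access to the channel it occupies (\p Channel) and to the
/// pointer increment to apply before that access (\p PtrIncr).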
933void R600TargetLowering::getStackAddress(unsigned StackWidth,
934                                         unsigned ElemIdx,
935                                         unsigned &Channel,
936                                         unsigned &PtrIncr) const {
937  switch (StackWidth) {
938  default:
939  case 1:
940    Channel = 0;
941    if (ElemIdx > 0) {
942      PtrIncr = 1;
943    } else {
944      PtrIncr = 0;
945    }
946    break;
947  case 2:
948    Channel = ElemIdx % 2;
949    if (ElemIdx == 2) {
950      PtrIncr = 1;
951    } else {
952      PtrIncr = 0;
953    }
954    break;
955  case 4:
956    Channel = ElemIdx;
957    PtrIncr = 0;
958    break;
959  }
960}
961
962SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
963  SDLoc DL(Op);
964  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
965  SDValue Chain = Op.getOperand(0);
966  SDValue Value = Op.getOperand(1);
967  SDValue Ptr = Op.getOperand(2);
968
969  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
970      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
971    // Convert pointer from byte address to dword address.
972    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
973                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
974                                  Ptr, DAG.getConstant(2, MVT::i32)));
975
976    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
977      assert(!"Truncated and indexed stores not supported yet");
978    } else {
979      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
980    }
981    return Chain;
982  }
983
984  EVT ValueVT = Value.getValueType();
985
986  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
987    return SDValue();
988  }
989
990  // Lowering for indirect addressing
991
992  const MachineFunction &MF = DAG.getMachineFunction();
993  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
994                                         getTargetMachine().getFrameLowering());
995  unsigned StackWidth = TFL->getStackWidth(MF);
996
997  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
998
999  if (ValueVT.isVector()) {
1000    unsigned NumElemVT = ValueVT.getVectorNumElements();
1001    EVT ElemVT = ValueVT.getVectorElementType();
1002    SDValue Stores[4];
1003
1004    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1005                                      "vector width in store");
1006
1007    for (unsigned i = 0; i < NumElemVT; ++i) {
1008      unsigned Channel, PtrIncr;
1009      getStackAddress(StackWidth, i, Channel, PtrIncr);
1010      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1011                        DAG.getConstant(PtrIncr, MVT::i32));
1012      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1013                                 Value, DAG.getConstant(i, MVT::i32));
1014
1015      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1016                              Chain, Elem, Ptr,
1017                              DAG.getTargetConstant(Channel, MVT::i32));
1018    }
1019    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1020  } else {
1021    if (ValueVT == MVT::i8) {
1022      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1023    }
1024    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain,
1025                        Value, Ptr, DAG.getTargetConstant(0, MVT::i32)); // Channel
1026  }
1027
1028  return Chain;
1029}
1030
1031// return (512 + (kc_bank << 12))
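// e.g. CONSTANT_BUFFER_2 maps to 512 + 2 * 4096 = 8704.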
1032static int
1033ConstantAddressBlock(unsigned AddressSpace) {
1034  switch (AddressSpace) {
1035  case AMDGPUAS::CONSTANT_BUFFER_0:
1036    return 512;
1037  case AMDGPUAS::CONSTANT_BUFFER_1:
1038    return 512 + 4096;
1039  case AMDGPUAS::CONSTANT_BUFFER_2:
1040    return 512 + 4096 * 2;
1041  case AMDGPUAS::CONSTANT_BUFFER_3:
1042    return 512 + 4096 * 3;
1043  case AMDGPUAS::CONSTANT_BUFFER_4:
1044    return 512 + 4096 * 4;
1045  case AMDGPUAS::CONSTANT_BUFFER_5:
1046    return 512 + 4096 * 5;
1047  case AMDGPUAS::CONSTANT_BUFFER_6:
1048    return 512 + 4096 * 6;
1049  case AMDGPUAS::CONSTANT_BUFFER_7:
1050    return 512 + 4096 * 7;
1051  case AMDGPUAS::CONSTANT_BUFFER_8:
1052    return 512 + 4096 * 8;
1053  case AMDGPUAS::CONSTANT_BUFFER_9:
1054    return 512 + 4096 * 9;
1055  case AMDGPUAS::CONSTANT_BUFFER_10:
1056    return 512 + 4096 * 10;
1057  case AMDGPUAS::CONSTANT_BUFFER_11:
1058    return 512 + 4096 * 11;
1059  case AMDGPUAS::CONSTANT_BUFFER_12:
1060    return 512 + 4096 * 12;
1061  case AMDGPUAS::CONSTANT_BUFFER_13:
1062    return 512 + 4096 * 13;
1063  case AMDGPUAS::CONSTANT_BUFFER_14:
1064    return 512 + 4096 * 14;
1065  case AMDGPUAS::CONSTANT_BUFFER_15:
1066    return 512 + 4096 * 15;
1067  default:
1068    return -1;
1069  }
1070}
1071
1072SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1073{
1074  EVT VT = Op.getValueType();
1075  SDLoc DL(Op);
1076  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1077  SDValue Chain = Op.getOperand(0);
1078  SDValue Ptr = Op.getOperand(1);
1079  SDValue LoweredLoad;
1080
1081  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1082  if (ConstantBlock > -1) {
1083    SDValue Result;
1084    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
1085        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
1086        dyn_cast<ConstantSDNode>(Ptr)) {
1087      SDValue Slots[4];
1088      for (unsigned i = 0; i < 4; i++) {
1089        // We want the Const position encoded with the following formula:
1090        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1091        // const_index is the Ptr computed by llvm, using an alignment of 16.
1092        // Thus we add (512 + (kc_bank << 12)) * 16 + chan * 4 here and
1093        // then divide by 4 at the ISel step.
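        // Worked example (illustrative): with kc_bank 0, Ptr 32 (const_index
        // 2) and chan 1 this builds 32 + 4 + 512 * 16 = 8228, and
        // 8228 / 4 == 2057 == ((512 + 2) << 2) + 1 as required.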
1094        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1095            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1096        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1097      }
1098      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
1099    } else {
1100      // A non-constant Ptr can't be folded; keep it as a vector CONST_ADDRESS load.
1101      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1102          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1103          DAG.getConstant(LoadNode->getAddressSpace() -
1104                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1105          );
1106    }
1107
1108    if (!VT.isVector()) {
1109      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1110          DAG.getConstant(0, MVT::i32));
1111    }
1112
1113    SDValue MergedValues[2] = {
1114        Result,
1115        Chain
1116    };
1117    return DAG.getMergeValues(MergedValues, 2, DL);
1118  }
1119
1120  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1121    return SDValue();
1122  }
1123
1124  // Lowering for indirect addressing
1125  const MachineFunction &MF = DAG.getMachineFunction();
1126  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1127                                         getTargetMachine().getFrameLowering());
1128  unsigned StackWidth = TFL->getStackWidth(MF);
1129
1130  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1131
1132  if (VT.isVector()) {
1133    unsigned NumElemVT = VT.getVectorNumElements();
1134    EVT ElemVT = VT.getVectorElementType();
1135    SDValue Loads[4];
1136
1137    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1138                                      "vector width in load");
1139
1140    for (unsigned i = 0; i < NumElemVT; ++i) {
1141      unsigned Channel, PtrIncr;
1142      getStackAddress(StackWidth, i, Channel, PtrIncr);
1143      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1144                        DAG.getConstant(PtrIncr, MVT::i32));
1145      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1146                             Chain, Ptr,
1147                             DAG.getTargetConstant(Channel, MVT::i32),
1148                             Op.getOperand(2));
1149    }
1150    for (unsigned i = NumElemVT; i < 4; ++i) {
1151      Loads[i] = DAG.getUNDEF(ElemVT);
1152    }
1153    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1154    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1155  } else {
1156    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1157                              Chain, Ptr,
1158                              DAG.getTargetConstant(0, MVT::i32), // Channel
1159                              Op.getOperand(2));
1160  }
1161
1162  SDValue Ops[2];
1163  Ops[0] = LoweredLoad;
1164  Ops[1] = Chain;
1165
1166  return DAG.getMergeValues(Ops, 2, DL);
1167}
1168
1169/// XXX Only kernel functions are supported, so we can assume for now that
1170/// every function is a kernel function, but in the future we should use
1171/// separate calling conventions for kernel and non-kernel functions.
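///
/// Explicit arguments start at byte offset 36 of the parameter buffer
/// (36 == 9 * 4, the nine implicit parameter dwords handled by
/// LowerImplicitParameter).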
1172SDValue R600TargetLowering::LowerFormalArguments(
1173                                      SDValue Chain,
1174                                      CallingConv::ID CallConv,
1175                                      bool isVarArg,
1176                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1177                                      SDLoc DL, SelectionDAG &DAG,
1178                                      SmallVectorImpl<SDValue> &InVals) const {
1179  unsigned ParamOffsetBytes = 36;
1180  Function::const_arg_iterator FuncArg =
1181                            DAG.getMachineFunction().getFunction()->arg_begin();
1182  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
1183    EVT VT = Ins[i].VT;
1184    Type *ArgType = FuncArg->getType();
1185    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
1186                             32 : ArgType->getPrimitiveSizeInBits();
1187    unsigned ArgBytes = ArgSizeInBits >> 3;
1188    EVT ArgVT;
1189    if (ArgSizeInBits < VT.getSizeInBits()) {
1190      assert(!ArgType->isFloatTy() &&
1191             "Extending floating point arguments not supported yet");
1192      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
1193    } else {
1194      ArgVT = VT;
1195    }
1196    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1197                                                    AMDGPUAS::PARAM_I_ADDRESS);
1198    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
1199                                DAG.getConstant(ParamOffsetBytes, MVT::i32),
1200                                       MachinePointerInfo(UndefValue::get(PtrTy)),
1201                                       ArgVT, false, false, ArgBytes);
1202    InVals.push_back(Arg);
1203    ParamOffsetBytes += ArgBytes;
1204  }
1205  return Chain;
1206}
1207
1208EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1209  if (!VT.isVector()) return MVT::i32;
1210  return VT.changeVectorElementTypeToInteger();
1211}
1212
1213//===----------------------------------------------------------------------===//
1214// Custom DAG Optimizations
1215//===----------------------------------------------------------------------===//
1216
1217SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1218                                              DAGCombinerInfo &DCI) const {
1219  SelectionDAG &DAG = DCI.DAG;
1220
1221  switch (N->getOpcode()) {
1222  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1223  case ISD::FP_ROUND: {
1224      SDValue Arg = N->getOperand(0);
1225      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1226        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1227                           Arg.getOperand(0));
1228      }
1229      break;
1230    }
1231
1232  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1233  // (i32 select_cc f32, f32, -1, 0 cc)
1234  //
1235  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1236  // this to one of the SET*_DX10 instructions.
1237  case ISD::FP_TO_SINT: {
1238    SDValue FNeg = N->getOperand(0);
1239    if (FNeg.getOpcode() != ISD::FNEG) {
1240      return SDValue();
1241    }
1242    SDValue SelectCC = FNeg.getOperand(0);
1243    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1244        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1245        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1246        !isHWTrueValue(SelectCC.getOperand(2)) ||
1247        !isHWFalseValue(SelectCC.getOperand(3))) {
1248      return SDValue();
1249    }
1250
1251    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1252                           SelectCC.getOperand(0), // LHS
1253                           SelectCC.getOperand(1), // RHS
1254                           DAG.getConstant(-1, MVT::i32), // True
1255                           DAG.getConstant(0, MVT::i32),  // False
1256                           SelectCC.getOperand(4)); // CC
1257
1258    break;
1259  }
1260  // EXTRACT_VECTOR_ELT of a BUILD_VECTOR generated by custom lowering
1261  // also needs to be custom combined here.
1262  case ISD::EXTRACT_VECTOR_ELT: {
1263    SDValue Arg = N->getOperand(0);
1264    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1265      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1266        unsigned Element = Const->getZExtValue();
1267        return Arg->getOperand(Element);
1268      }
1269    }
1270    if (Arg.getOpcode() == ISD::BITCAST &&
1271        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1272      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1273        unsigned Element = Const->getZExtValue();
1274        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1275            Arg->getOperand(0).getOperand(Element));
1276      }
1277    }
1278  }
1279
1280  case ISD::SELECT_CC: {
1281    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1282    //      selectcc x, y, a, b, inv(cc)
1283    //
1284    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1285    //      selectcc x, y, a, b, cc
1286    SDValue LHS = N->getOperand(0);
1287    if (LHS.getOpcode() != ISD::SELECT_CC) {
1288      return SDValue();
1289    }
1290
1291    SDValue RHS = N->getOperand(1);
1292    SDValue True = N->getOperand(2);
1293    SDValue False = N->getOperand(3);
1294    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1295
1296    if (LHS.getOperand(2).getNode() != True.getNode() ||
1297        LHS.getOperand(3).getNode() != False.getNode() ||
1298        RHS.getNode() != False.getNode()) {
1299      return SDValue();
1300    }
1301
1302    switch (NCC) {
1303    default: return SDValue();
1304    case ISD::SETNE: return LHS;
1305    case ISD::SETEQ: {
1306      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1307      LHSCC = ISD::getSetCCInverse(LHSCC,
1308                                  LHS.getOperand(0).getValueType().isInteger());
1309      return DAG.getSelectCC(SDLoc(N),
1310                             LHS.getOperand(0),
1311                             LHS.getOperand(1),
1312                             LHS.getOperand(2),
1313                             LHS.getOperand(3),
1314                             LHSCC);
1315    }
1316    }
1317  }
1318  case AMDGPUISD::EXPORT: {
1319    SDValue Arg = N->getOperand(1);
1320    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1321      break;
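    // Channels that are the constants 0.0 or 1.0 can be encoded directly in
    // the export swizzle (SEL_0 / SEL_1 below) instead of occupying a vector
    // lane, so rebuild the vector with those lanes left undef.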
1322    SDValue NewBldVec[4] = {
1323        DAG.getUNDEF(MVT::f32),
1324        DAG.getUNDEF(MVT::f32),
1325        DAG.getUNDEF(MVT::f32),
1326        DAG.getUNDEF(MVT::f32)
1327      };
1328    SDValue NewArgs[8] = {
1329      N->getOperand(0), // Chain
1330      SDValue(),
1331      N->getOperand(2), // ArrayBase
1332      N->getOperand(3), // Type
1333      N->getOperand(4), // SWZ_X
1334      N->getOperand(5), // SWZ_Y
1335      N->getOperand(6), // SWZ_Z
1336      N->getOperand(7) // SWZ_W
1337    };
1338    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
1339      if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
1340        if (C->isZero()) {
1341          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
1342        } else if (C->isExactlyValue(1.0)) {
1343          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_1
1344        } else {
1345          NewBldVec[i] = Arg.getOperand(i);
1346        }
1347      } else {
1348        NewBldVec[i] = Arg.getOperand(i);
1349      }
1350    }
1351    SDLoc DL(N);
1352    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
1353    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1354  }
1355  }
1356  return SDValue();
1357}
1358