//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;
28
29R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
30    AMDGPUTargetLowering(TM),
31    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
32  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
33  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
34  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
35  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
36  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
37  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
38
39  computeRegisterProperties();
40
41  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
42  setOperationAction(ISD::FADD, MVT::v2f32, Expand);
43  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
44  setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
45  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
46  setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
47  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
48  setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
49
50  setOperationAction(ISD::FCOS, MVT::f32, Custom);
51  setOperationAction(ISD::FSIN, MVT::f32, Custom);
52
53  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
54  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
55
56  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
57  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
58
59  setOperationAction(ISD::FSUB, MVT::f32, Expand);
60
61  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
62  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
63  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
64
65  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
66  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
67
68  setOperationAction(ISD::SETCC, MVT::i32, Expand);
69  setOperationAction(ISD::SETCC, MVT::f32, Expand);
70  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
71
72  setOperationAction(ISD::SELECT, MVT::i32, Custom);
73  setOperationAction(ISD::SELECT, MVT::f32, Custom);
74
75  // Legalize loads and stores to the private address space.
76  setOperationAction(ISD::LOAD, MVT::i32, Custom);
77  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
78  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
79  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
80  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
81  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
82  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
83  setOperationAction(ISD::STORE, MVT::i8, Custom);
84  setOperationAction(ISD::STORE, MVT::i32, Custom);
85  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
86  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
91
92  setTargetDAGCombine(ISD::FP_ROUND);
93  setTargetDAGCombine(ISD::FP_TO_SINT);
94  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
95  setTargetDAGCombine(ISD::SELECT_CC);
96  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
97
98  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
99
100  setBooleanContents(ZeroOrNegativeOneBooleanContent);
101  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
102  setSchedulingPreference(Sched::VLIW);
103}
104
105MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
106    MachineInstr * MI, MachineBasicBlock * BB) const {
107  MachineFunction * MF = BB->getParent();
108  MachineRegisterInfo &MRI = MF->getRegInfo();
109  MachineBasicBlock::iterator I = *MI;
110  const R600InstrInfo *TII =
111    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
112
113  switch (MI->getOpcode()) {
114  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
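  // CLAMP, FABS and FNEG are lowered to a plain MOV with the corresponding
  // clamp/abs/neg flag set on the new instruction.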
115  case AMDGPU::CLAMP_R600: {
116    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
117                                                   AMDGPU::MOV,
118                                                   MI->getOperand(0).getReg(),
119                                                   MI->getOperand(1).getReg());
120    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
121    break;
122  }
123
124  case AMDGPU::FABS_R600: {
125    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
126                                                    AMDGPU::MOV,
127                                                    MI->getOperand(0).getReg(),
128                                                    MI->getOperand(1).getReg());
129    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
130    break;
131  }
132
133  case AMDGPU::FNEG_R600: {
134    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
135                                                    AMDGPU::MOV,
136                                                    MI->getOperand(0).getReg(),
137                                                    MI->getOperand(1).getReg());
138    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
139    break;
140  }
141
142  case AMDGPU::MASK_WRITE: {
143    unsigned maskedRegister = MI->getOperand(0).getReg();
144    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
145    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
146    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
147    break;
148  }
149
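  // LDS reads return their result in OQAP; re-emit the instruction with OQAP
  // as its explicit destination and copy OQAP into the original result
  // register.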
150  case AMDGPU::LDS_READ_RET: {
151    MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
152                                        TII->get(MI->getOpcode()),
153                                        AMDGPU::OQAP);
154    for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
155      NewMI.addOperand(MI->getOperand(i));
156    }
157    TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
158                                 MI->getOperand(0).getReg(),
159                                 AMDGPU::OQAP);
160    break;
161  }
162
163  case AMDGPU::MOV_IMM_F32:
164    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
165                     MI->getOperand(1).getFPImm()->getValueAPF()
166                         .bitcastToAPInt().getZExtValue());
167    break;
168  case AMDGPU::MOV_IMM_I32:
169    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
170                     MI->getOperand(1).getImm());
171    break;
172  case AMDGPU::CONST_COPY: {
173    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
174        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
175    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
176        MI->getOperand(1).getImm());
177    break;
178  }
179
180  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
181  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
182  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
183    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
184
185    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
186            .addOperand(MI->getOperand(0))
187            .addOperand(MI->getOperand(1))
188            .addImm(EOP); // Set End of program bit
189    break;
190  }
191
192  case AMDGPU::TXD: {
193    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
194    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
195    MachineOperand &RID = MI->getOperand(4);
196    MachineOperand &SID = MI->getOperand(5);
197    unsigned TextureId = MI->getOperand(6).getImm();
198    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
199    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
200
201    switch (TextureId) {
202    case 5: // Rect
203      CTX = CTY = 0;
204      break;
205    case 6: // Shadow1D
206      SrcW = SrcZ;
207      break;
208    case 7: // Shadow2D
209      SrcW = SrcZ;
210      break;
211    case 8: // ShadowRect
212      CTX = CTY = 0;
213      SrcW = SrcZ;
214      break;
215    case 9: // 1DArray
216      SrcZ = SrcY;
217      CTZ = 0;
218      break;
219    case 10: // 2DArray
220      CTZ = 0;
221      break;
222    case 11: // Shadow1DArray
223      SrcZ = SrcY;
224      CTZ = 0;
225      break;
226    case 12: // Shadow2DArray
227      CTZ = 0;
228      break;
229    }
230    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
231            .addOperand(MI->getOperand(3))
232            .addImm(SrcX)
233            .addImm(SrcY)
234            .addImm(SrcZ)
235            .addImm(SrcW)
236            .addImm(0)
237            .addImm(0)
238            .addImm(0)
239            .addImm(0)
240            .addImm(1)
241            .addImm(2)
242            .addImm(3)
243            .addOperand(RID)
244            .addOperand(SID)
245            .addImm(CTX)
246            .addImm(CTY)
247            .addImm(CTZ)
248            .addImm(CTW);
249    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
250            .addOperand(MI->getOperand(2))
251            .addImm(SrcX)
252            .addImm(SrcY)
253            .addImm(SrcZ)
254            .addImm(SrcW)
255            .addImm(0)
256            .addImm(0)
257            .addImm(0)
258            .addImm(0)
259            .addImm(1)
260            .addImm(2)
261            .addImm(3)
262            .addOperand(RID)
263            .addOperand(SID)
264            .addImm(CTX)
265            .addImm(CTY)
266            .addImm(CTZ)
267            .addImm(CTW);
268    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
269            .addOperand(MI->getOperand(0))
270            .addOperand(MI->getOperand(1))
271            .addImm(SrcX)
272            .addImm(SrcY)
273            .addImm(SrcZ)
274            .addImm(SrcW)
275            .addImm(0)
276            .addImm(0)
277            .addImm(0)
278            .addImm(0)
279            .addImm(1)
280            .addImm(2)
281            .addImm(3)
282            .addOperand(RID)
283            .addOperand(SID)
284            .addImm(CTX)
285            .addImm(CTY)
286            .addImm(CTZ)
287            .addImm(CTW)
288            .addReg(T0, RegState::Implicit)
289            .addReg(T1, RegState::Implicit);
290    break;
291  }
292
293  case AMDGPU::TXD_SHADOW: {
294    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
295    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
296    MachineOperand &RID = MI->getOperand(4);
297    MachineOperand &SID = MI->getOperand(5);
298    unsigned TextureId = MI->getOperand(6).getImm();
299    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
300    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
301
302    switch (TextureId) {
303    case 5: // Rect
304      CTX = CTY = 0;
305      break;
306    case 6: // Shadow1D
307      SrcW = SrcZ;
308      break;
309    case 7: // Shadow2D
310      SrcW = SrcZ;
311      break;
312    case 8: // ShadowRect
313      CTX = CTY = 0;
314      SrcW = SrcZ;
315      break;
316    case 9: // 1DArray
317      SrcZ = SrcY;
318      CTZ = 0;
319      break;
320    case 10: // 2DArray
321      CTZ = 0;
322      break;
323    case 11: // Shadow1DArray
324      SrcZ = SrcY;
325      CTZ = 0;
326      break;
327    case 12: // Shadow2DArray
328      CTZ = 0;
329      break;
330    }
331
332    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
333            .addOperand(MI->getOperand(3))
334            .addImm(SrcX)
335            .addImm(SrcY)
336            .addImm(SrcZ)
337            .addImm(SrcW)
338            .addImm(0)
339            .addImm(0)
340            .addImm(0)
341            .addImm(0)
342            .addImm(1)
343            .addImm(2)
344            .addImm(3)
345            .addOperand(RID)
346            .addOperand(SID)
347            .addImm(CTX)
348            .addImm(CTY)
349            .addImm(CTZ)
350            .addImm(CTW);
351    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
352            .addOperand(MI->getOperand(2))
353            .addImm(SrcX)
354            .addImm(SrcY)
355            .addImm(SrcZ)
356            .addImm(SrcW)
357            .addImm(0)
358            .addImm(0)
359            .addImm(0)
360            .addImm(0)
361            .addImm(1)
362            .addImm(2)
363            .addImm(3)
364            .addOperand(RID)
365            .addOperand(SID)
366            .addImm(CTX)
367            .addImm(CTY)
368            .addImm(CTZ)
369            .addImm(CTW);
370    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
371            .addOperand(MI->getOperand(0))
372            .addOperand(MI->getOperand(1))
373            .addImm(SrcX)
374            .addImm(SrcY)
375            .addImm(SrcZ)
376            .addImm(SrcW)
377            .addImm(0)
378            .addImm(0)
379            .addImm(0)
380            .addImm(0)
381            .addImm(1)
382            .addImm(2)
383            .addImm(3)
384            .addOperand(RID)
385            .addOperand(SID)
386            .addImm(CTX)
387            .addImm(CTY)
388            .addImm(CTZ)
389            .addImm(CTW)
390            .addReg(T0, RegState::Implicit)
391            .addReg(T1, RegState::Implicit);
392    break;
393  }
394
395  case AMDGPU::BRANCH:
396      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
397              .addOperand(MI->getOperand(0));
398      break;
399
400  case AMDGPU::BRANCH_COND_f32: {
401    MachineInstr *NewMI =
402      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
403              AMDGPU::PREDICATE_BIT)
404              .addOperand(MI->getOperand(1))
405              .addImm(OPCODE_IS_NOT_ZERO)
406              .addImm(0); // Flags
407    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
408    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
409            .addOperand(MI->getOperand(0))
410            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
411    break;
412  }
413
414  case AMDGPU::BRANCH_COND_i32: {
415    MachineInstr *NewMI =
416      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
417            AMDGPU::PREDICATE_BIT)
418            .addOperand(MI->getOperand(1))
419            .addImm(OPCODE_IS_NOT_ZERO_INT)
420            .addImm(0); // Flags
421    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
422    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
423           .addOperand(MI->getOperand(0))
424            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
425    break;
426  }
427
428  case AMDGPU::EG_ExportSwz:
429  case AMDGPU::R600_ExportSwz: {
    // The instruction is left unmodified if it is not the last one of its type.
431    bool isLastInstructionOfItsType = true;
432    unsigned InstExportType = MI->getOperand(1).getImm();
433    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
434         EndBlock = BB->end(); NextExportInst != EndBlock;
435         NextExportInst = llvm::next(NextExportInst)) {
436      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
437          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
438        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
439            .getImm();
440        if (CurrentInstExportType == InstExportType) {
441          isLastInstructionOfItsType = false;
442          break;
443        }
444      }
445    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
447    if (!EOP && !isLastInstructionOfItsType)
448      return BB;
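    // CF_INST encoding for the export: 84 on Evergreen, 40 on R600.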
449    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
450    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
451            .addOperand(MI->getOperand(0))
452            .addOperand(MI->getOperand(1))
453            .addOperand(MI->getOperand(2))
454            .addOperand(MI->getOperand(3))
455            .addOperand(MI->getOperand(4))
456            .addOperand(MI->getOperand(5))
457            .addOperand(MI->getOperand(6))
458            .addImm(CfInst)
459            .addImm(EOP);
460    break;
461  }
462  case AMDGPU::RETURN: {
463    // RETURN instructions must have the live-out registers as implicit uses,
464    // otherwise they appear dead.
465    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
466    MachineInstrBuilder MIB(*MF, MI);
467    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
468      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
469    return BB;
470  }
471  }
472
473  MI->eraseFromParent();
474  return BB;
475}
476
477//===----------------------------------------------------------------------===//
478// Custom DAG Lowering Operations
479//===----------------------------------------------------------------------===//
480
481SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
482  MachineFunction &MF = DAG.getMachineFunction();
483  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
484  switch (Op.getOpcode()) {
485  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
486  case ISD::FCOS:
487  case ISD::FSIN: return LowerTrig(Op, DAG);
488  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
489  case ISD::SELECT: return LowerSELECT(Op, DAG);
490  case ISD::STORE: return LowerSTORE(Op, DAG);
491  case ISD::LOAD: return LowerLOAD(Op, DAG);
492  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
493  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
494  case ISD::INTRINSIC_VOID: {
495    SDValue Chain = Op.getOperand(0);
496    unsigned IntrinsicID =
497                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
498    switch (IntrinsicID) {
499    case AMDGPUIntrinsic::AMDGPU_store_output: {
500      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
501      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
502      MFI->LiveOuts.push_back(Reg);
503      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
504    }
505    case AMDGPUIntrinsic::R600_store_swizzle: {
506      const SDValue Args[8] = {
507        Chain,
508        Op.getOperand(2), // Export Value
509        Op.getOperand(3), // ArrayBase
510        Op.getOperand(4), // Type
511        DAG.getConstant(0, MVT::i32), // SWZ_X
512        DAG.getConstant(1, MVT::i32), // SWZ_Y
513        DAG.getConstant(2, MVT::i32), // SWZ_Z
514        DAG.getConstant(3, MVT::i32) // SWZ_W
515      };
516      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
517          Args, 8);
518    }
519
520    // default for switch(IntrinsicID)
521    default: break;
522    }
523    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
524    break;
525  }
526  case ISD::INTRINSIC_WO_CHAIN: {
527    unsigned IntrinsicID =
528                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
529    EVT VT = Op.getValueType();
530    SDLoc DL(Op);
531    switch(IntrinsicID) {
532    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
533    case AMDGPUIntrinsic::R600_load_input: {
534      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
535      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
536      MachineFunction &MF = DAG.getMachineFunction();
537      MachineRegisterInfo &MRI = MF.getRegInfo();
538      MRI.addLiveIn(Reg);
539      return DAG.getCopyFromReg(DAG.getEntryNode(),
540          SDLoc(DAG.getEntryNode()), Reg, VT);
541    }
542
543    case AMDGPUIntrinsic::R600_interp_input: {
544      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
545      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
546      MachineSDNode *interp;
547      if (ijb < 0) {
548        const MachineFunction &MF = DAG.getMachineFunction();
549        const R600InstrInfo *TII =
550          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
551        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
552            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
553        return DAG.getTargetExtractSubreg(
554            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
555            DL, MVT::f32, SDValue(interp, 0));
556      }
557
558      MachineFunction &MF = DAG.getMachineFunction();
559      MachineRegisterInfo &MRI = MF.getRegInfo();
560      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
561      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
562      MRI.addLiveIn(RegisterI);
563      MRI.addLiveIn(RegisterJ);
564      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
565          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
566      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
567          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
568
569      if (slot % 4 < 2)
570        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
571            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
572            RegisterJNode, RegisterINode);
573      else
574        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
575            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
576            RegisterJNode, RegisterINode);
577      return SDValue(interp, slot % 2);
578    }
579    case AMDGPUIntrinsic::R600_tex:
580    case AMDGPUIntrinsic::R600_texc:
581    case AMDGPUIntrinsic::R600_txl:
582    case AMDGPUIntrinsic::R600_txlc:
583    case AMDGPUIntrinsic::R600_txb:
584    case AMDGPUIntrinsic::R600_txbc:
585    case AMDGPUIntrinsic::R600_txf:
586    case AMDGPUIntrinsic::R600_txq:
587    case AMDGPUIntrinsic::R600_ddx:
588    case AMDGPUIntrinsic::R600_ddy: {
589      unsigned TextureOp;
590      switch (IntrinsicID) {
591      case AMDGPUIntrinsic::R600_tex:
592        TextureOp = 0;
593        break;
594      case AMDGPUIntrinsic::R600_texc:
595        TextureOp = 1;
596        break;
597      case AMDGPUIntrinsic::R600_txl:
598        TextureOp = 2;
599        break;
600      case AMDGPUIntrinsic::R600_txlc:
601        TextureOp = 3;
602        break;
603      case AMDGPUIntrinsic::R600_txb:
604        TextureOp = 4;
605        break;
606      case AMDGPUIntrinsic::R600_txbc:
607        TextureOp = 5;
608        break;
609      case AMDGPUIntrinsic::R600_txf:
610        TextureOp = 6;
611        break;
612      case AMDGPUIntrinsic::R600_txq:
613        TextureOp = 7;
614        break;
615      case AMDGPUIntrinsic::R600_ddx:
616        TextureOp = 8;
617        break;
618      case AMDGPUIntrinsic::R600_ddy:
619        TextureOp = 9;
620        break;
621      default:
        llvm_unreachable("Unknown Texture Operation");
623      }
624
625      SDValue TexArgs[19] = {
626        DAG.getConstant(TextureOp, MVT::i32),
627        Op.getOperand(1),
628        DAG.getConstant(0, MVT::i32),
629        DAG.getConstant(1, MVT::i32),
630        DAG.getConstant(2, MVT::i32),
631        DAG.getConstant(3, MVT::i32),
632        Op.getOperand(2),
633        Op.getOperand(3),
634        Op.getOperand(4),
635        DAG.getConstant(0, MVT::i32),
636        DAG.getConstant(1, MVT::i32),
637        DAG.getConstant(2, MVT::i32),
638        DAG.getConstant(3, MVT::i32),
639        Op.getOperand(5),
640        Op.getOperand(6),
641        Op.getOperand(7),
642        Op.getOperand(8),
643        Op.getOperand(9),
644        Op.getOperand(10)
645      };
646      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
647    }
648    case AMDGPUIntrinsic::AMDGPU_dp4: {
649      SDValue Args[8] = {
650      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
651          DAG.getConstant(0, MVT::i32)),
652      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
653          DAG.getConstant(0, MVT::i32)),
654      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
655          DAG.getConstant(1, MVT::i32)),
656      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
657          DAG.getConstant(1, MVT::i32)),
658      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
659          DAG.getConstant(2, MVT::i32)),
660      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
661          DAG.getConstant(2, MVT::i32)),
662      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
663          DAG.getConstant(3, MVT::i32)),
664      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
665          DAG.getConstant(3, MVT::i32))
666      };
667      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
668    }
669
670    case Intrinsic::r600_read_ngroups_x:
671      return LowerImplicitParameter(DAG, VT, DL, 0);
672    case Intrinsic::r600_read_ngroups_y:
673      return LowerImplicitParameter(DAG, VT, DL, 1);
674    case Intrinsic::r600_read_ngroups_z:
675      return LowerImplicitParameter(DAG, VT, DL, 2);
676    case Intrinsic::r600_read_global_size_x:
677      return LowerImplicitParameter(DAG, VT, DL, 3);
678    case Intrinsic::r600_read_global_size_y:
679      return LowerImplicitParameter(DAG, VT, DL, 4);
680    case Intrinsic::r600_read_global_size_z:
681      return LowerImplicitParameter(DAG, VT, DL, 5);
682    case Intrinsic::r600_read_local_size_x:
683      return LowerImplicitParameter(DAG, VT, DL, 6);
684    case Intrinsic::r600_read_local_size_y:
685      return LowerImplicitParameter(DAG, VT, DL, 7);
686    case Intrinsic::r600_read_local_size_z:
687      return LowerImplicitParameter(DAG, VT, DL, 8);
688
689    case Intrinsic::r600_read_tgid_x:
690      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
691                                  AMDGPU::T1_X, VT);
692    case Intrinsic::r600_read_tgid_y:
693      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
694                                  AMDGPU::T1_Y, VT);
695    case Intrinsic::r600_read_tgid_z:
696      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
697                                  AMDGPU::T1_Z, VT);
698    case Intrinsic::r600_read_tidig_x:
699      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
700                                  AMDGPU::T0_X, VT);
701    case Intrinsic::r600_read_tidig_y:
702      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
703                                  AMDGPU::T0_Y, VT);
704    case Intrinsic::r600_read_tidig_z:
705      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
706                                  AMDGPU::T0_Z, VT);
707    }
708    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
709    break;
710  }
711  } // end switch(Op.getOpcode())
712  return SDValue();
713}
714
715void R600TargetLowering::ReplaceNodeResults(SDNode *N,
716                                            SmallVectorImpl<SDValue> &Results,
717                                            SelectionDAG &DAG) const {
718  switch (N->getOpcode()) {
719  default: return;
720  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
721    return;
722  case ISD::LOAD: {
723    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
724    Results.push_back(SDValue(Node, 0));
725    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM does not seem to replace the chain value inside
    // CustomWidenLowerNode, so replace it explicitly here.
728    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
729    return;
730  }
731  case ISD::STORE:
732    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
733    Results.push_back(SDValue(Node, 0));
734    return;
735  }
736}
737
738SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hardware >= R700, the COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG(FRACT(x / (2 * Pi) + 0.5) - 0.5);
  // 0.15915494309 below is 1 / (2 * Pi).
741  EVT VT = Op.getValueType();
742  SDValue Arg = Op.getOperand(0);
743  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
744      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
745        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
746          DAG.getConstantFP(0.15915494309, MVT::f32)),
747        DAG.getConstantFP(0.5, MVT::f32)));
748  unsigned TrigNode;
749  switch (Op.getOpcode()) {
750  case ISD::FCOS:
751    TrigNode = AMDGPUISD::COS_HW;
752    break;
753  case ISD::FSIN:
754    TrigNode = AMDGPUISD::SIN_HW;
755    break;
756  default:
757    llvm_unreachable("Wrong trig opcode");
758  }
759  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
760      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
761        DAG.getConstantFP(-0.5, MVT::f32)));
762  if (Gen >= AMDGPUSubtarget::R700)
763    return TrigVal;
764  // On R600 hw, COS/SIN input must be between -Pi and Pi.
765  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
766      DAG.getConstantFP(3.14159265359, MVT::f32));
767}
768
769SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
770  return DAG.getNode(
771      ISD::SETCC,
772      SDLoc(Op),
773      MVT::i1,
774      Op, DAG.getConstantFP(0.0f, MVT::f32),
775      DAG.getCondCode(ISD::SETNE)
776      );
777}
778
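// Implicit parameters (the values returned by the r600_read_ngroups_*,
// r600_read_global_size_* and r600_read_local_size_* intrinsics) are stored as
// consecutive dwords at the start of CONSTANT_BUFFER_0, ahead of the explicit
// kernel arguments (see LowerFormalArguments).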
779SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
780                                                   SDLoc DL,
781                                                   unsigned DwordOffset) const {
782  unsigned ByteOffset = DwordOffset * 4;
783  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
784                                      AMDGPUAS::CONSTANT_BUFFER_0);
785
  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
787  assert(isInt<16>(ByteOffset));
788
789  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
790                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
791                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
792                     false, false, false, 0);
793}
794
795SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
796
797  MachineFunction &MF = DAG.getMachineFunction();
798  const AMDGPUFrameLowering *TFL =
799   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
800
801  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
802  assert(FIN);
803
804  unsigned FrameIndex = FIN->getIndex();
805  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
806  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
807}
808
809bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
813    return CstFP->isZero();
814  } else {
815    return false;
816  }
817}
818
819SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
820  SDLoc DL(Op);
821  EVT VT = Op.getValueType();
822
823  SDValue LHS = Op.getOperand(0);
824  SDValue RHS = Op.getOperand(1);
825  SDValue True = Op.getOperand(2);
826  SDValue False = Op.getOperand(3);
827  SDValue CC = Op.getOperand(4);
828  SDValue Temp;
829
830  // LHS and RHS are guaranteed to be the same value type
831  EVT CompareVT = LHS.getValueType();
832
833  // Check if we can lower this to a native operation.
834
835  // Try to lower to a SET* instruction:
836  //
837  // SET* can match the following patterns:
838  //
839  // select_cc f32, f32, -1,  0, cc_any
840  // select_cc f32, f32, 1.0f, 0.0f, cc_any
841  // select_cc i32, i32, -1,  0, cc_any
842  //
843
844  // Move hardware True/False values to the correct operand.
845  if (isHWTrueValue(False) && isHWFalseValue(True)) {
846    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
847    std::swap(False, True);
848    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
849  }
850
851  if (isHWTrueValue(True) && isHWFalseValue(False) &&
852      (CompareVT == VT || VT == MVT::i32)) {
853    // This can be matched by a SET* instruction.
854    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
855  }
856
857  // Try to lower to a CND* instruction:
858  //
859  // CND* can match the following patterns:
860  //
861  // select_cc f32, 0.0, f32, f32, cc_any
862  // select_cc f32, 0.0, i32, i32, cc_any
863  // select_cc i32, 0,   f32, f32, cc_any
864  // select_cc i32, 0,   i32, i32, cc_any
865  //
866  if (isZero(LHS) || isZero(RHS)) {
867    SDValue Cond = (isZero(LHS) ? RHS : LHS);
868    SDValue Zero = (isZero(LHS) ? LHS : RHS);
869    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
870    if (CompareVT != VT) {
871      // Bitcast True / False to the correct types.  This will end up being
872      // a nop, but it allows us to define only a single pattern in the
873      // .TD files for each CND* instruction rather than having to have
874      // one pattern for integer True/False and one for fp True/False
875      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
876      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
877    }
878    if (isZero(LHS)) {
879      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
880    }
881
882    switch (CCOpcode) {
883    case ISD::SETONE:
884    case ISD::SETUNE:
885    case ISD::SETNE:
886    case ISD::SETULE:
887    case ISD::SETULT:
888    case ISD::SETOLE:
889    case ISD::SETOLT:
890    case ISD::SETLE:
891    case ISD::SETLT:
892      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
893      Temp = True;
894      True = False;
895      False = Temp;
896      break;
897    default:
898      break;
899    }
900    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
901        Cond, Zero,
902        True, False,
903        DAG.getCondCode(CCOpcode));
904    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
905  }
906
907
908  // Possible Min/Max pattern
909  SDValue MinMax = LowerMinMax(Op, DAG);
910  if (MinMax.getNode()) {
911    return MinMax;
912  }
913
  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
916  SDValue HWTrue, HWFalse;
917
918  if (CompareVT == MVT::f32) {
919    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
920    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
921  } else if (CompareVT == MVT::i32) {
922    HWTrue = DAG.getConstant(-1, CompareVT);
923    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
926    assert(!"Unhandled value type in LowerSELECT_CC");
927  }
928
929  // Lower this unsupported SELECT_CC into a combination of two supported
930  // SELECT_CC operations.
931  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
932
933  return DAG.getNode(ISD::SELECT_CC, DL, VT,
934      Cond, HWFalse,
935      True, False,
936      DAG.getCondCode(ISD::SETNE));
937}
938
939SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
940  return DAG.getNode(ISD::SELECT_CC,
941      SDLoc(Op),
942      Op.getValueType(),
943      Op.getOperand(0),
944      DAG.getConstant(0, MVT::i32),
945      Op.getOperand(1),
946      Op.getOperand(2),
947      DAG.getCondCode(ISD::SETNE));
948}
949
/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
/// convert these pointers to a register index.  Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
955SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
956                                               unsigned StackWidth,
957                                               SelectionDAG &DAG) const {
958  unsigned SRLPad;
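  // Each stack slot covers StackWidth 32-bit channels (StackWidth * 4 bytes),
  // so shifting the byte address right by log2(StackWidth * 4) gives the
  // register index.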
  switch (StackWidth) {
960  case 1:
961    SRLPad = 2;
962    break;
963  case 2:
964    SRLPad = 3;
965    break;
966  case 4:
967    SRLPad = 4;
968    break;
969  default: llvm_unreachable("Invalid stack width");
970  }
971
972  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
973                     DAG.getConstant(SRLPad, MVT::i32));
974}
975
976void R600TargetLowering::getStackAddress(unsigned StackWidth,
977                                         unsigned ElemIdx,
978                                         unsigned &Channel,
979                                         unsigned &PtrIncr) const {
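  // Map element ElemIdx to the channel it occupies within a stack register and
  // the amount the register pointer must be advanced from the previous
  // element, given that each stack slot uses StackWidth channels.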
980  switch (StackWidth) {
981  default:
982  case 1:
983    Channel = 0;
984    if (ElemIdx > 0) {
985      PtrIncr = 1;
986    } else {
987      PtrIncr = 0;
988    }
989    break;
990  case 2:
991    Channel = ElemIdx % 2;
992    if (ElemIdx == 2) {
993      PtrIncr = 1;
994    } else {
995      PtrIncr = 0;
996    }
997    break;
998  case 4:
999    Channel = ElemIdx;
1000    PtrIncr = 0;
1001    break;
1002  }
1003}
1004
1005SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1006  SDLoc DL(Op);
1007  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1008  SDValue Chain = Op.getOperand(0);
1009  SDValue Value = Op.getOperand(1);
1010  SDValue Ptr = Op.getOperand(2);
1011
1012  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
1013      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
1014    // Convert pointer from byte address to dword address.
1015    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1016                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1017                                  Ptr, DAG.getConstant(2, MVT::i32)));
1018
1019    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1020      assert(!"Truncated and indexed stores not supported yet");
1021    } else {
1022      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1023    }
1024    return Chain;
1025  }
1026
1027  EVT ValueVT = Value.getValueType();
1028
1029  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1030    return SDValue();
1031  }
1032
1033  // Lowering for indirect addressing
1034
1035  const MachineFunction &MF = DAG.getMachineFunction();
1036  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1037                                         getTargetMachine().getFrameLowering());
1038  unsigned StackWidth = TFL->getStackWidth(MF);
1039
1040  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1041
1042  if (ValueVT.isVector()) {
1043    unsigned NumElemVT = ValueVT.getVectorNumElements();
1044    EVT ElemVT = ValueVT.getVectorElementType();
1045    SDValue Stores[4];
1046
    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in store");
1049
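    // Scalarize the vector store: emit one REGISTER_STORE per element and
    // join the resulting chains with a TokenFactor.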
1050    for (unsigned i = 0; i < NumElemVT; ++i) {
1051      unsigned Channel, PtrIncr;
1052      getStackAddress(StackWidth, i, Channel, PtrIncr);
1053      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1054                        DAG.getConstant(PtrIncr, MVT::i32));
1055      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1056                                 Value, DAG.getConstant(i, MVT::i32));
1057
1058      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1059                              Chain, Elem, Ptr,
1060                              DAG.getTargetConstant(Channel, MVT::i32));
1061    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
1064    if (ValueVT == MVT::i8) {
1065      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1066    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                        Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
1069  }
1070
1071  return Chain;
1072}
1073
// Returns 512 + (kc_bank << 12).
1075static int
1076ConstantAddressBlock(unsigned AddressSpace) {
1077  switch (AddressSpace) {
1078  case AMDGPUAS::CONSTANT_BUFFER_0:
1079    return 512;
1080  case AMDGPUAS::CONSTANT_BUFFER_1:
1081    return 512 + 4096;
1082  case AMDGPUAS::CONSTANT_BUFFER_2:
1083    return 512 + 4096 * 2;
1084  case AMDGPUAS::CONSTANT_BUFFER_3:
1085    return 512 + 4096 * 3;
1086  case AMDGPUAS::CONSTANT_BUFFER_4:
1087    return 512 + 4096 * 4;
1088  case AMDGPUAS::CONSTANT_BUFFER_5:
1089    return 512 + 4096 * 5;
1090  case AMDGPUAS::CONSTANT_BUFFER_6:
1091    return 512 + 4096 * 6;
1092  case AMDGPUAS::CONSTANT_BUFFER_7:
1093    return 512 + 4096 * 7;
1094  case AMDGPUAS::CONSTANT_BUFFER_8:
1095    return 512 + 4096 * 8;
1096  case AMDGPUAS::CONSTANT_BUFFER_9:
1097    return 512 + 4096 * 9;
1098  case AMDGPUAS::CONSTANT_BUFFER_10:
1099    return 512 + 4096 * 10;
1100  case AMDGPUAS::CONSTANT_BUFFER_11:
1101    return 512 + 4096 * 11;
1102  case AMDGPUAS::CONSTANT_BUFFER_12:
1103    return 512 + 4096 * 12;
1104  case AMDGPUAS::CONSTANT_BUFFER_13:
1105    return 512 + 4096 * 13;
1106  case AMDGPUAS::CONSTANT_BUFFER_14:
1107    return 512 + 4096 * 14;
1108  case AMDGPUAS::CONSTANT_BUFFER_15:
1109    return 512 + 4096 * 15;
1110  default:
1111    return -1;
1112  }
1113}
1114
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1117  EVT VT = Op.getValueType();
1118  SDLoc DL(Op);
1119  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1120  SDValue Chain = Op.getOperand(0);
1121  SDValue Ptr = Op.getOperand(1);
1122  SDValue LoweredLoad;
1123
1124  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1125  if (ConstantBlock > -1) {
1126    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
        isa<Constant>(LoadNode->getSrcValue()) ||
        isa<ConstantSDNode>(Ptr)) {
1130      SDValue Slots[4];
1131      for (unsigned i = 0; i < 4; i++) {
        // We want the constant position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is the Ptr computed by LLVM using an alignment of 16.
        // Thus we add (512 + (kc_bank << 12)) * 16 + chan * 4 here and
        // then divide by 4 at the ISel step.
1137        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1138            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1139        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1140      }
1141      EVT NewVT = MVT::v4i32;
1142      unsigned NumElements = 4;
1143      if (VT.isVector()) {
1144        NewVT = VT;
1145        NumElements = VT.getVectorNumElements();
1146      }
1147      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
1148    } else {
      // A non-constant pointer can't be folded; keep it as a v4f32 load.
1150      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1151          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1152          DAG.getConstant(LoadNode->getAddressSpace() -
1153                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1154          );
1155    }
1156
1157    if (!VT.isVector()) {
1158      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1159          DAG.getConstant(0, MVT::i32));
1160    }
1161
1162    SDValue MergedValues[2] = {
1163        Result,
1164        Chain
1165    };
1166    return DAG.getMergeValues(MergedValues, 2, DL);
1167  }
1168
  // For most operations, returning SDValue() will result in the node being
  // expanded by the DAG Legalizer.  This is not the case for ISD::LOAD, so
  // we need to manually expand loads that may be legal in some address spaces
  // and illegal in others.  SEXT loads from CONSTANT_BUFFER_0 are supported
  // for compute shaders, since the data is sign-extended when it is uploaded
  // to the buffer.  However, SEXT loads from other address spaces are not
  // supported, so we need to expand them here.
1176  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1177    EVT MemVT = LoadNode->getMemoryVT();
1178    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1179    SDValue ShiftAmount =
1180          DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1181    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1182                                  LoadNode->getPointerInfo(), MemVT,
1183                                  LoadNode->isVolatile(),
1184                                  LoadNode->isNonTemporal(),
1185                                  LoadNode->getAlignment());
1186    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1187    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1188
1189    SDValue MergedValues[2] = { Sra, Chain };
1190    return DAG.getMergeValues(MergedValues, 2, DL);
1191  }
1192
1193  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1194    return SDValue();
1195  }
1196
1197  // Lowering for indirect addressing
1198  const MachineFunction &MF = DAG.getMachineFunction();
1199  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1200                                         getTargetMachine().getFrameLowering());
1201  unsigned StackWidth = TFL->getStackWidth(MF);
1202
1203  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1204
1205  if (VT.isVector()) {
1206    unsigned NumElemVT = VT.getVectorNumElements();
1207    EVT ElemVT = VT.getVectorElementType();
1208    SDValue Loads[4];
1209
1210    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1211                                      "vector width in load");
1212
1213    for (unsigned i = 0; i < NumElemVT; ++i) {
1214      unsigned Channel, PtrIncr;
1215      getStackAddress(StackWidth, i, Channel, PtrIncr);
1216      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1217                        DAG.getConstant(PtrIncr, MVT::i32));
1218      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1219                             Chain, Ptr,
1220                             DAG.getTargetConstant(Channel, MVT::i32),
1221                             Op.getOperand(2));
1222    }
1223    for (unsigned i = NumElemVT; i < 4; ++i) {
1224      Loads[i] = DAG.getUNDEF(ElemVT);
1225    }
1226    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1227    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1228  } else {
1229    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1230                              Chain, Ptr,
1231                              DAG.getTargetConstant(0, MVT::i32), // Channel
1232                              Op.getOperand(2));
1233  }
1234
1235  SDValue Ops[2];
1236  Ops[0] = LoweredLoad;
1237  Ops[1] = Chain;
1238
1239  return DAG.getMergeValues(Ops, 2, DL);
1240}
1241
1242/// XXX Only kernel functions are supported, so we can assume for now that
1243/// every function is a kernel function, but in the future we should use
1244/// separate calling conventions for kernel and non-kernel functions.
1245SDValue R600TargetLowering::LowerFormalArguments(
1246                                      SDValue Chain,
1247                                      CallingConv::ID CallConv,
1248                                      bool isVarArg,
1249                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1250                                      SDLoc DL, SelectionDAG &DAG,
1251                                      SmallVectorImpl<SDValue> &InVals) const {
1252  SmallVector<CCValAssign, 16> ArgLocs;
1253  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1254                 getTargetMachine(), ArgLocs, *DAG.getContext());
1255
1256  AnalyzeFormalArguments(CCInfo, Ins);
1257
1258  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1259    CCValAssign &VA = ArgLocs[i];
1260    EVT VT = VA.getLocVT();
1261
1262    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1263                                                   AMDGPUAS::CONSTANT_BUFFER_0);
1264
    // The first 36 bytes of the input buffer contain information about the
    // thread group and global sizes.
1267    SDValue Arg = DAG.getLoad(VT, DL, Chain,
1268                           DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1269                           MachinePointerInfo(UndefValue::get(PtrTy)), false,
                           false, false, 4); // 4 is the preferred alignment for
                                             // the CONSTANT memory space.
1272    InVals.push_back(Arg);
1273  }
1274  return Chain;
1275}
1276
1277EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1278   if (!VT.isVector()) return MVT::i32;
1279   return VT.changeVectorElementTypeToInteger();
1280}
1281
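// Replace BUILD_VECTOR operands that are constant 0.0 or 1.0 with undef and
// record SEL_0 / SEL_1 in RemapSwizzle; operands that repeat an earlier lane
// are folded onto that lane as well.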
1282static SDValue
1283CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
1284                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
1285  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1286  assert(RemapSwizzle.empty());
1287  SDValue NewBldVec[4] = {
1288      VectorEntry.getOperand(0),
1289      VectorEntry.getOperand(1),
1290      VectorEntry.getOperand(2),
1291      VectorEntry.getOperand(3)
1292  };
1293
1294  for (unsigned i = 0; i < 4; i++) {
1295    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1296      if (C->isZero()) {
1297        RemapSwizzle[i] = 4; // SEL_0
1298        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1299      } else if (C->isExactlyValue(1.0)) {
1300        RemapSwizzle[i] = 5; // SEL_1
1301        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1302      }
1303    }
1304
1305    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1306      continue;
1307    for (unsigned j = 0; j < i; j++) {
1308      if (NewBldVec[i] == NewBldVec[j]) {
1309        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1310        RemapSwizzle[i] = j;
1311        break;
1312      }
1313    }
1314  }
1315
1316  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1317      VectorEntry.getValueType(), NewBldVec, 4);
1318}
1319
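// For operands that come from an EXTRACT_VECTOR_ELT, try to move them back to
// the lane they were extracted from, recording the swaps in RemapSwizzle.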
1320static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1321                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
1322  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1323  assert(RemapSwizzle.empty());
1324  SDValue NewBldVec[4] = {
1325      VectorEntry.getOperand(0),
1326      VectorEntry.getOperand(1),
1327      VectorEntry.getOperand(2),
1328      VectorEntry.getOperand(3)
1329  };
1330  bool isUnmovable[4] = { false, false, false, false };
1331  for (unsigned i = 0; i < 4; i++)
1332    RemapSwizzle[i] = i;
1333
1334  for (unsigned i = 0; i < 4; i++) {
1335    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
1338      if (!isUnmovable[Idx]) {
1339        // Swap i and Idx
1340        std::swap(NewBldVec[Idx], NewBldVec[i]);
1341        std::swap(RemapSwizzle[RemapSwizzle[Idx]], RemapSwizzle[RemapSwizzle[i]]);
1342      }
1343      isUnmovable[Idx] = true;
1344    }
1345  }
1346
1347  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1348      VectorEntry.getValueType(), NewBldVec, 4);
1349}
1350
1351
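// Rewrite the BUILD_VECTOR that feeds an export or texture instruction, along
// with its swizzle operands Swz, so that constant lanes become SEL_0/SEL_1 and
// the remaining lanes line up with their source channels.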
1352SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1353SDValue Swz[4], SelectionDAG &DAG) const {
1354  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1355  // Old -> New swizzle values
1356  DenseMap<unsigned, unsigned> SwizzleRemap;
1357
1358  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1359  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1361    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1362      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1363  }
1364
1365  SwizzleRemap.clear();
1366  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1367  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1369    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1370      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1371  }
1372
1373  return BuildVector;
1374}
1375
1376
1377//===----------------------------------------------------------------------===//
1378// Custom DAG Optimizations
1379//===----------------------------------------------------------------------===//
1380
1381SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1382                                              DAGCombinerInfo &DCI) const {
1383  SelectionDAG &DAG = DCI.DAG;
1384
1385  switch (N->getOpcode()) {
1386  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1387  case ISD::FP_ROUND: {
1388      SDValue Arg = N->getOperand(0);
1389      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1390        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1391                           Arg.getOperand(0));
1392      }
1393      break;
1394    }
1395
1396  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1397  // (i32 select_cc f32, f32, -1, 0 cc)
1398  //
1399  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1400  // this to one of the SET*_DX10 instructions.
1401  case ISD::FP_TO_SINT: {
1402    SDValue FNeg = N->getOperand(0);
1403    if (FNeg.getOpcode() != ISD::FNEG) {
1404      return SDValue();
1405    }
1406    SDValue SelectCC = FNeg.getOperand(0);
1407    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1408        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1409        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1410        !isHWTrueValue(SelectCC.getOperand(2)) ||
1411        !isHWFalseValue(SelectCC.getOperand(3))) {
1412      return SDValue();
1413    }
1414
    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                           SelectCC.getOperand(0), // LHS
                           SelectCC.getOperand(1), // RHS
                           DAG.getConstant(-1, MVT::i32), // True
                           DAG.getConstant(0, MVT::i32),  // False
                           SelectCC.getOperand(4)); // CC
1423  }
1424
  // insert_vector_elt (build_vector elt0, …, eltN), NewElt, idx
  // => build_vector elt0, …, NewElt, …, eltN
1427  case ISD::INSERT_VECTOR_ELT: {
1428    SDValue InVec = N->getOperand(0);
1429    SDValue InVal = N->getOperand(1);
1430    SDValue EltNo = N->getOperand(2);
1431    SDLoc dl(N);
1432
1433    // If the inserted element is an UNDEF, just use the input vector.
1434    if (InVal.getOpcode() == ISD::UNDEF)
1435      return InVec;
1436
1437    EVT VT = InVec.getValueType();
1438
1439    // If we can't generate a legal BUILD_VECTOR, exit
1440    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1441      return SDValue();
1442
1443    // Check that we know which element is being inserted
1444    if (!isa<ConstantSDNode>(EltNo))
1445      return SDValue();
1446    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1447
1448    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1449    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1450    // vector elements.
1451    SmallVector<SDValue, 8> Ops;
1452    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1453      Ops.append(InVec.getNode()->op_begin(),
1454                 InVec.getNode()->op_end());
1455    } else if (InVec.getOpcode() == ISD::UNDEF) {
1456      unsigned NElts = VT.getVectorNumElements();
1457      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1458    } else {
1459      return SDValue();
1460    }
1461
1462    // Insert the element
1463    if (Elt < Ops.size()) {
1464      // All the operands of BUILD_VECTOR must have the same type;
1465      // we enforce that here.
1466      EVT OpVT = Ops[0].getValueType();
1467      if (InVal.getValueType() != OpVT)
1468        InVal = OpVT.bitsGT(InVal.getValueType()) ?
1469          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1470          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1471      Ops[Elt] = InVal;
1472    }
1473
1474    // Return the new vector
1475    return DAG.getNode(ISD::BUILD_VECTOR, dl,
1476                       VT, &Ops[0], Ops.size());
1477  }
1478
  // An extract_vector_elt of a build_vector generated by custom lowering
  // also needs to be combined here.
1481  case ISD::EXTRACT_VECTOR_ELT: {
1482    SDValue Arg = N->getOperand(0);
1483    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1484      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1485        unsigned Element = Const->getZExtValue();
1486        return Arg->getOperand(Element);
1487      }
1488    }
1489    if (Arg.getOpcode() == ISD::BITCAST &&
1490        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1491      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1492        unsigned Element = Const->getZExtValue();
1493        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1494            Arg->getOperand(0).getOperand(Element));
1495      }
1496    }
    break;
  }

1499  case ISD::SELECT_CC: {
1500    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1501    //      selectcc x, y, a, b, inv(cc)
1502    //
1503    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1504    //      selectcc x, y, a, b, cc
1505    SDValue LHS = N->getOperand(0);
1506    if (LHS.getOpcode() != ISD::SELECT_CC) {
1507      return SDValue();
1508    }
1509
1510    SDValue RHS = N->getOperand(1);
1511    SDValue True = N->getOperand(2);
1512    SDValue False = N->getOperand(3);
1513    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1514
1515    if (LHS.getOperand(2).getNode() != True.getNode() ||
1516        LHS.getOperand(3).getNode() != False.getNode() ||
1517        RHS.getNode() != False.getNode()) {
1518      return SDValue();
1519    }
1520
1521    switch (NCC) {
1522    default: return SDValue();
1523    case ISD::SETNE: return LHS;
1524    case ISD::SETEQ: {
1525      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1526      LHSCC = ISD::getSetCCInverse(LHSCC,
1527                                  LHS.getOperand(0).getValueType().isInteger());
1528      return DAG.getSelectCC(SDLoc(N),
1529                             LHS.getOperand(0),
1530                             LHS.getOperand(1),
1531                             LHS.getOperand(2),
1532                             LHS.getOperand(3),
1533                             LHSCC);
1534    }
1535    }
1536  }
1537  case AMDGPUISD::EXPORT: {
1538    SDValue Arg = N->getOperand(1);
1539    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1540      break;
1541
1542    SDValue NewArgs[8] = {
1543      N->getOperand(0), // Chain
1544      SDValue(),
1545      N->getOperand(2), // ArrayBase
1546      N->getOperand(3), // Type
1547      N->getOperand(4), // SWZ_X
1548      N->getOperand(5), // SWZ_Y
1549      N->getOperand(6), // SWZ_Z
1550      N->getOperand(7) // SWZ_W
1551    };
1552    SDLoc DL(N);
1553    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1554    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1555  }
1556  case AMDGPUISD::TEXTURE_FETCH: {
1557    SDValue Arg = N->getOperand(1);
1558    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1559      break;
1560
1561    SDValue NewArgs[19] = {
1562      N->getOperand(0),
1563      N->getOperand(1),
1564      N->getOperand(2),
1565      N->getOperand(3),
1566      N->getOperand(4),
1567      N->getOperand(5),
1568      N->getOperand(6),
1569      N->getOperand(7),
1570      N->getOperand(8),
1571      N->getOperand(9),
1572      N->getOperand(10),
1573      N->getOperand(11),
1574      N->getOperand(12),
1575      N->getOperand(13),
1576      N->getOperand(14),
1577      N->getOperand(15),
1578      N->getOperand(16),
1579      N->getOperand(17),
1580      N->getOperand(18),
1581    };
1582    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1583    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1584        NewArgs, 19);
1585  }
1586  }
1587  return SDValue();
1588}
1589