R600ISelLowering.cpp revision 36b56886974eae4f9c5ebc96befd3e7bfe5de338
1//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for R600
12//
13//===----------------------------------------------------------------------===//
14
15#include "R600ISelLowering.h"
16#include "R600Defines.h"
17#include "R600InstrInfo.h"
18#include "R600MachineFunctionInfo.h"
19#include "llvm/CodeGen/CallingConvLower.h"
20#include "llvm/CodeGen/MachineFrameInfo.h"
21#include "llvm/CodeGen/MachineInstrBuilder.h"
22#include "llvm/CodeGen/MachineRegisterInfo.h"
23#include "llvm/CodeGen/SelectionDAG.h"
24#include "llvm/IR/Argument.h"
25#include "llvm/IR/Function.h"
26
27using namespace llvm;
28
29R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
30    AMDGPUTargetLowering(TM),
31    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
32  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
33  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
34  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
35  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
36  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
37  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
38
39  computeRegisterProperties();
40
41  // Set condition code actions
42  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
43  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
44  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
45  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
46  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
47  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
48  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
49  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
50  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
51  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
52  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
53  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
54
55  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
56  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
57  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
58  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
59
60  setOperationAction(ISD::FCOS, MVT::f32, Custom);
61  setOperationAction(ISD::FSIN, MVT::f32, Custom);
62
63  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
64  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
65
66  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
67  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
68
69  setOperationAction(ISD::FSUB, MVT::f32, Expand);
70
71  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
72  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
73  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
74
75  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
76  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
77
78  setOperationAction(ISD::SETCC, MVT::i32, Expand);
79  setOperationAction(ISD::SETCC, MVT::f32, Expand);
80  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
81
82  setOperationAction(ISD::SELECT, MVT::i32, Expand);
83  setOperationAction(ISD::SELECT, MVT::f32, Expand);
84  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
85  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
86  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
87  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
88
89  // Legalize loads and stores to the private address space.
90  setOperationAction(ISD::LOAD, MVT::i32, Custom);
91  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
92  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
93
94  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
95  // spaces, so it is custom lowered to handle those where it isn't.
96  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
97  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
98  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
99  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
100  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
101  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
102
103  setOperationAction(ISD::STORE, MVT::i8, Custom);
104  setOperationAction(ISD::STORE, MVT::i32, Custom);
105  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
106  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
107  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
108  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
109
110  setOperationAction(ISD::LOAD, MVT::i32, Custom);
111  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
112  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
113
114  setTargetDAGCombine(ISD::FP_ROUND);
115  setTargetDAGCombine(ISD::FP_TO_SINT);
116  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
117  setTargetDAGCombine(ISD::SELECT_CC);
118  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
119
120  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
121
122  setBooleanContents(ZeroOrNegativeOneBooleanContent);
123  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
124  setSchedulingPreference(Sched::Source);
125}
126
127MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
128    MachineInstr * MI, MachineBasicBlock * BB) const {
129  MachineFunction * MF = BB->getParent();
130  MachineRegisterInfo &MRI = MF->getRegInfo();
131  MachineBasicBlock::iterator I = *MI;
132  const R600InstrInfo *TII =
133    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
134
135  switch (MI->getOpcode()) {
136  default:
137    // Replace LDS_*_RET instructions that don't have any uses with the
138    // equivalent LDS_*_NORET instruction.
139    if (TII->isLDSRetInstr(MI->getOpcode())) {
140      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
141      assert(DstIdx != -1);
142      MachineInstrBuilder NewMI;
143      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
144        return BB;
145
146      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
147                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
148      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
149        NewMI.addOperand(MI->getOperand(i));
150      }
151    } else {
152      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
153    }
154    break;
155  case AMDGPU::CLAMP_R600: {
156    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
157                                                   AMDGPU::MOV,
158                                                   MI->getOperand(0).getReg(),
159                                                   MI->getOperand(1).getReg());
160    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
161    break;
162  }
163
164  case AMDGPU::FABS_R600: {
165    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
166                                                    AMDGPU::MOV,
167                                                    MI->getOperand(0).getReg(),
168                                                    MI->getOperand(1).getReg());
169    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
170    break;
171  }
172
173  case AMDGPU::FNEG_R600: {
174    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
175                                                    AMDGPU::MOV,
176                                                    MI->getOperand(0).getReg(),
177                                                    MI->getOperand(1).getReg());
178    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
179    break;
180  }
181
182  case AMDGPU::MASK_WRITE: {
183    unsigned maskedRegister = MI->getOperand(0).getReg();
184    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
185    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
186    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
187    break;
188  }
189
190  case AMDGPU::MOV_IMM_F32:
191    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
192                     MI->getOperand(1).getFPImm()->getValueAPF()
193                         .bitcastToAPInt().getZExtValue());
194    break;
195  case AMDGPU::MOV_IMM_I32:
196    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
197                     MI->getOperand(1).getImm());
198    break;
199  case AMDGPU::CONST_COPY: {
200    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
201        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
202    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
203        MI->getOperand(1).getImm());
204    break;
205  }
206
207  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
208  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
209  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
210    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
211
212    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
213            .addOperand(MI->getOperand(0))
214            .addOperand(MI->getOperand(1))
215            .addImm(EOP); // Set End of program bit
216    break;
217  }
218
219  case AMDGPU::TXD: {
220    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
221    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
222    MachineOperand &RID = MI->getOperand(4);
223    MachineOperand &SID = MI->getOperand(5);
224    unsigned TextureId = MI->getOperand(6).getImm();
225    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
226    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
227
228    switch (TextureId) {
229    case 5: // Rect
230      CTX = CTY = 0;
231      break;
232    case 6: // Shadow1D
233      SrcW = SrcZ;
234      break;
235    case 7: // Shadow2D
236      SrcW = SrcZ;
237      break;
238    case 8: // ShadowRect
239      CTX = CTY = 0;
240      SrcW = SrcZ;
241      break;
242    case 9: // 1DArray
243      SrcZ = SrcY;
244      CTZ = 0;
245      break;
246    case 10: // 2DArray
247      CTZ = 0;
248      break;
249    case 11: // Shadow1DArray
250      SrcZ = SrcY;
251      CTZ = 0;
252      break;
253    case 12: // Shadow2DArray
254      CTZ = 0;
255      break;
256    }
257    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
258            .addOperand(MI->getOperand(3))
259            .addImm(SrcX)
260            .addImm(SrcY)
261            .addImm(SrcZ)
262            .addImm(SrcW)
263            .addImm(0)
264            .addImm(0)
265            .addImm(0)
266            .addImm(0)
267            .addImm(1)
268            .addImm(2)
269            .addImm(3)
270            .addOperand(RID)
271            .addOperand(SID)
272            .addImm(CTX)
273            .addImm(CTY)
274            .addImm(CTZ)
275            .addImm(CTW);
276    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
277            .addOperand(MI->getOperand(2))
278            .addImm(SrcX)
279            .addImm(SrcY)
280            .addImm(SrcZ)
281            .addImm(SrcW)
282            .addImm(0)
283            .addImm(0)
284            .addImm(0)
285            .addImm(0)
286            .addImm(1)
287            .addImm(2)
288            .addImm(3)
289            .addOperand(RID)
290            .addOperand(SID)
291            .addImm(CTX)
292            .addImm(CTY)
293            .addImm(CTZ)
294            .addImm(CTW);
295    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
296            .addOperand(MI->getOperand(0))
297            .addOperand(MI->getOperand(1))
298            .addImm(SrcX)
299            .addImm(SrcY)
300            .addImm(SrcZ)
301            .addImm(SrcW)
302            .addImm(0)
303            .addImm(0)
304            .addImm(0)
305            .addImm(0)
306            .addImm(1)
307            .addImm(2)
308            .addImm(3)
309            .addOperand(RID)
310            .addOperand(SID)
311            .addImm(CTX)
312            .addImm(CTY)
313            .addImm(CTZ)
314            .addImm(CTW)
315            .addReg(T0, RegState::Implicit)
316            .addReg(T1, RegState::Implicit);
317    break;
318  }
319
320  case AMDGPU::TXD_SHADOW: {
321    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
322    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
323    MachineOperand &RID = MI->getOperand(4);
324    MachineOperand &SID = MI->getOperand(5);
325    unsigned TextureId = MI->getOperand(6).getImm();
326    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
327    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
328
329    switch (TextureId) {
330    case 5: // Rect
331      CTX = CTY = 0;
332      break;
333    case 6: // Shadow1D
334      SrcW = SrcZ;
335      break;
336    case 7: // Shadow2D
337      SrcW = SrcZ;
338      break;
339    case 8: // ShadowRect
340      CTX = CTY = 0;
341      SrcW = SrcZ;
342      break;
343    case 9: // 1DArray
344      SrcZ = SrcY;
345      CTZ = 0;
346      break;
347    case 10: // 2DArray
348      CTZ = 0;
349      break;
350    case 11: // Shadow1DArray
351      SrcZ = SrcY;
352      CTZ = 0;
353      break;
354    case 12: // Shadow2DArray
355      CTZ = 0;
356      break;
357    }
358
359    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
360            .addOperand(MI->getOperand(3))
361            .addImm(SrcX)
362            .addImm(SrcY)
363            .addImm(SrcZ)
364            .addImm(SrcW)
365            .addImm(0)
366            .addImm(0)
367            .addImm(0)
368            .addImm(0)
369            .addImm(1)
370            .addImm(2)
371            .addImm(3)
372            .addOperand(RID)
373            .addOperand(SID)
374            .addImm(CTX)
375            .addImm(CTY)
376            .addImm(CTZ)
377            .addImm(CTW);
378    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
379            .addOperand(MI->getOperand(2))
380            .addImm(SrcX)
381            .addImm(SrcY)
382            .addImm(SrcZ)
383            .addImm(SrcW)
384            .addImm(0)
385            .addImm(0)
386            .addImm(0)
387            .addImm(0)
388            .addImm(1)
389            .addImm(2)
390            .addImm(3)
391            .addOperand(RID)
392            .addOperand(SID)
393            .addImm(CTX)
394            .addImm(CTY)
395            .addImm(CTZ)
396            .addImm(CTW);
397    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
398            .addOperand(MI->getOperand(0))
399            .addOperand(MI->getOperand(1))
400            .addImm(SrcX)
401            .addImm(SrcY)
402            .addImm(SrcZ)
403            .addImm(SrcW)
404            .addImm(0)
405            .addImm(0)
406            .addImm(0)
407            .addImm(0)
408            .addImm(1)
409            .addImm(2)
410            .addImm(3)
411            .addOperand(RID)
412            .addOperand(SID)
413            .addImm(CTX)
414            .addImm(CTY)
415            .addImm(CTZ)
416            .addImm(CTW)
417            .addReg(T0, RegState::Implicit)
418            .addReg(T1, RegState::Implicit);
419    break;
420  }
421
422  case AMDGPU::BRANCH:
423      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
424              .addOperand(MI->getOperand(0));
425      break;
426
427  case AMDGPU::BRANCH_COND_f32: {
428    MachineInstr *NewMI =
429      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
430              AMDGPU::PREDICATE_BIT)
431              .addOperand(MI->getOperand(1))
432              .addImm(OPCODE_IS_NOT_ZERO)
433              .addImm(0); // Flags
434    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
435    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
436            .addOperand(MI->getOperand(0))
437            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
438    break;
439  }
440
441  case AMDGPU::BRANCH_COND_i32: {
442    MachineInstr *NewMI =
443      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
444            AMDGPU::PREDICATE_BIT)
445            .addOperand(MI->getOperand(1))
446            .addImm(OPCODE_IS_NOT_ZERO_INT)
447            .addImm(0); // Flags
448    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
449    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
450           .addOperand(MI->getOperand(0))
451            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
452    break;
453  }
454
455  case AMDGPU::EG_ExportSwz:
456  case AMDGPU::R600_ExportSwz: {
457    // Instruction is left unmodified if it's not the last one of its type
458    bool isLastInstructionOfItsType = true;
459    unsigned InstExportType = MI->getOperand(1).getImm();
460    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
461         EndBlock = BB->end(); NextExportInst != EndBlock;
462         NextExportInst = std::next(NextExportInst)) {
463      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
464          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
465        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
466            .getImm();
467        if (CurrentInstExportType == InstExportType) {
468          isLastInstructionOfItsType = false;
469          break;
470        }
471      }
472    }
473    bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
474    if (!EOP && !isLastInstructionOfItsType)
475      return BB;
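    // The CF instruction encoding for the export differs between Evergreen
    // (EG_ExportSwz) and R600 (R600_ExportSwz); pick the value matching this
    // opcode.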
476    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
477    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
478            .addOperand(MI->getOperand(0))
479            .addOperand(MI->getOperand(1))
480            .addOperand(MI->getOperand(2))
481            .addOperand(MI->getOperand(3))
482            .addOperand(MI->getOperand(4))
483            .addOperand(MI->getOperand(5))
484            .addOperand(MI->getOperand(6))
485            .addImm(CfInst)
486            .addImm(EOP);
487    break;
488  }
489  case AMDGPU::RETURN: {
490    // RETURN instructions must have the live-out registers as implicit uses,
491    // otherwise they appear dead.
492    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
493    MachineInstrBuilder MIB(*MF, MI);
494    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
495      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
496    return BB;
497  }
498  }
499
500  MI->eraseFromParent();
501  return BB;
502}
503
504//===----------------------------------------------------------------------===//
505// Custom DAG Lowering Operations
506//===----------------------------------------------------------------------===//
507
508SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
509  MachineFunction &MF = DAG.getMachineFunction();
510  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
511  switch (Op.getOpcode()) {
512  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
513  case ISD::FCOS:
514  case ISD::FSIN: return LowerTrig(Op, DAG);
515  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
516  case ISD::STORE: return LowerSTORE(Op, DAG);
517  case ISD::LOAD: return LowerLOAD(Op, DAG);
518  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
519  case ISD::INTRINSIC_VOID: {
520    SDValue Chain = Op.getOperand(0);
521    unsigned IntrinsicID =
522                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
523    switch (IntrinsicID) {
524    case AMDGPUIntrinsic::AMDGPU_store_output: {
525      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
526      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
527      MFI->LiveOuts.push_back(Reg);
528      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
529    }
530    case AMDGPUIntrinsic::R600_store_swizzle: {
531      const SDValue Args[8] = {
532        Chain,
533        Op.getOperand(2), // Export Value
534        Op.getOperand(3), // ArrayBase
535        Op.getOperand(4), // Type
536        DAG.getConstant(0, MVT::i32), // SWZ_X
537        DAG.getConstant(1, MVT::i32), // SWZ_Y
538        DAG.getConstant(2, MVT::i32), // SWZ_Z
539        DAG.getConstant(3, MVT::i32) // SWZ_W
540      };
541      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
542          Args, 8);
543    }
544
545    // default for switch(IntrinsicID)
546    default: break;
547    }
548    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
549    break;
550  }
551  case ISD::INTRINSIC_WO_CHAIN: {
552    unsigned IntrinsicID =
553                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
554    EVT VT = Op.getValueType();
555    SDLoc DL(Op);
556    switch(IntrinsicID) {
557    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
558    case AMDGPUIntrinsic::R600_load_input: {
559      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
560      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
561      MachineFunction &MF = DAG.getMachineFunction();
562      MachineRegisterInfo &MRI = MF.getRegInfo();
563      MRI.addLiveIn(Reg);
564      return DAG.getCopyFromReg(DAG.getEntryNode(),
565          SDLoc(DAG.getEntryNode()), Reg, VT);
566    }
567
568    case AMDGPUIntrinsic::R600_interp_input: {
569      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
570      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
571      MachineSDNode *interp;
572      if (ijb < 0) {
573        const MachineFunction &MF = DAG.getMachineFunction();
574        const R600InstrInfo *TII =
575          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
576        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
577            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
578        return DAG.getTargetExtractSubreg(
579            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
580            DL, MVT::f32, SDValue(interp, 0));
581      }
582      MachineFunction &MF = DAG.getMachineFunction();
583      MachineRegisterInfo &MRI = MF.getRegInfo();
584      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
585      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
586      MRI.addLiveIn(RegisterI);
587      MRI.addLiveIn(RegisterJ);
588      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
589          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
590      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
591          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
592
593      if (slot % 4 < 2)
594        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
595            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
596            RegisterJNode, RegisterINode);
597      else
598        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
599            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
600            RegisterJNode, RegisterINode);
601      return SDValue(interp, slot % 2);
602    }
603    case AMDGPUIntrinsic::R600_interp_xy:
604    case AMDGPUIntrinsic::R600_interp_zw: {
605      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
606      MachineSDNode *interp;
607      SDValue RegisterINode = Op.getOperand(2);
608      SDValue RegisterJNode = Op.getOperand(3);
609
610      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
611        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
612            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
613            RegisterJNode, RegisterINode);
614      else
615        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
616            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
617            RegisterJNode, RegisterINode);
618      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
619          SDValue(interp, 0), SDValue(interp, 1));
620    }
621    case AMDGPUIntrinsic::R600_tex:
622    case AMDGPUIntrinsic::R600_texc:
623    case AMDGPUIntrinsic::R600_txl:
624    case AMDGPUIntrinsic::R600_txlc:
625    case AMDGPUIntrinsic::R600_txb:
626    case AMDGPUIntrinsic::R600_txbc:
627    case AMDGPUIntrinsic::R600_txf:
628    case AMDGPUIntrinsic::R600_txq:
629    case AMDGPUIntrinsic::R600_ddx:
630    case AMDGPUIntrinsic::R600_ddy:
631    case AMDGPUIntrinsic::R600_ldptr: {
632      unsigned TextureOp;
633      switch (IntrinsicID) {
634      case AMDGPUIntrinsic::R600_tex:
635        TextureOp = 0;
636        break;
637      case AMDGPUIntrinsic::R600_texc:
638        TextureOp = 1;
639        break;
640      case AMDGPUIntrinsic::R600_txl:
641        TextureOp = 2;
642        break;
643      case AMDGPUIntrinsic::R600_txlc:
644        TextureOp = 3;
645        break;
646      case AMDGPUIntrinsic::R600_txb:
647        TextureOp = 4;
648        break;
649      case AMDGPUIntrinsic::R600_txbc:
650        TextureOp = 5;
651        break;
652      case AMDGPUIntrinsic::R600_txf:
653        TextureOp = 6;
654        break;
655      case AMDGPUIntrinsic::R600_txq:
656        TextureOp = 7;
657        break;
658      case AMDGPUIntrinsic::R600_ddx:
659        TextureOp = 8;
660        break;
661      case AMDGPUIntrinsic::R600_ddy:
662        TextureOp = 9;
663        break;
664      case AMDGPUIntrinsic::R600_ldptr:
665        TextureOp = 10;
666        break;
667      default:
668        llvm_unreachable("Unknow Texture Operation");
669      }
670
671      SDValue TexArgs[19] = {
672        DAG.getConstant(TextureOp, MVT::i32),
673        Op.getOperand(1),
674        DAG.getConstant(0, MVT::i32),
675        DAG.getConstant(1, MVT::i32),
676        DAG.getConstant(2, MVT::i32),
677        DAG.getConstant(3, MVT::i32),
678        Op.getOperand(2),
679        Op.getOperand(3),
680        Op.getOperand(4),
681        DAG.getConstant(0, MVT::i32),
682        DAG.getConstant(1, MVT::i32),
683        DAG.getConstant(2, MVT::i32),
684        DAG.getConstant(3, MVT::i32),
685        Op.getOperand(5),
686        Op.getOperand(6),
687        Op.getOperand(7),
688        Op.getOperand(8),
689        Op.getOperand(9),
690        Op.getOperand(10)
691      };
692      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
693    }
694    case AMDGPUIntrinsic::AMDGPU_dp4: {
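      // Expand dp4 into a DOT4 node fed by the eight scalar x/y/z/w
      // components extracted from the two vector operands.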
695      SDValue Args[8] = {
696      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
697          DAG.getConstant(0, MVT::i32)),
698      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
699          DAG.getConstant(0, MVT::i32)),
700      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
701          DAG.getConstant(1, MVT::i32)),
702      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
703          DAG.getConstant(1, MVT::i32)),
704      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
705          DAG.getConstant(2, MVT::i32)),
706      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
707          DAG.getConstant(2, MVT::i32)),
708      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
709          DAG.getConstant(3, MVT::i32)),
710      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
711          DAG.getConstant(3, MVT::i32))
712      };
713      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
714    }
715
716    case Intrinsic::r600_read_ngroups_x:
717      return LowerImplicitParameter(DAG, VT, DL, 0);
718    case Intrinsic::r600_read_ngroups_y:
719      return LowerImplicitParameter(DAG, VT, DL, 1);
720    case Intrinsic::r600_read_ngroups_z:
721      return LowerImplicitParameter(DAG, VT, DL, 2);
722    case Intrinsic::r600_read_global_size_x:
723      return LowerImplicitParameter(DAG, VT, DL, 3);
724    case Intrinsic::r600_read_global_size_y:
725      return LowerImplicitParameter(DAG, VT, DL, 4);
726    case Intrinsic::r600_read_global_size_z:
727      return LowerImplicitParameter(DAG, VT, DL, 5);
728    case Intrinsic::r600_read_local_size_x:
729      return LowerImplicitParameter(DAG, VT, DL, 6);
730    case Intrinsic::r600_read_local_size_y:
731      return LowerImplicitParameter(DAG, VT, DL, 7);
732    case Intrinsic::r600_read_local_size_z:
733      return LowerImplicitParameter(DAG, VT, DL, 8);
734
735    case Intrinsic::r600_read_tgid_x:
736      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
737                                  AMDGPU::T1_X, VT);
738    case Intrinsic::r600_read_tgid_y:
739      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
740                                  AMDGPU::T1_Y, VT);
741    case Intrinsic::r600_read_tgid_z:
742      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
743                                  AMDGPU::T1_Z, VT);
744    case Intrinsic::r600_read_tidig_x:
745      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
746                                  AMDGPU::T0_X, VT);
747    case Intrinsic::r600_read_tidig_y:
748      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
749                                  AMDGPU::T0_Y, VT);
750    case Intrinsic::r600_read_tidig_z:
751      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
752                                  AMDGPU::T0_Z, VT);
753    }
754    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
755    break;
756  }
757  } // end switch(Op.getOpcode())
758  return SDValue();
759}
760
761void R600TargetLowering::ReplaceNodeResults(SDNode *N,
762                                            SmallVectorImpl<SDValue> &Results,
763                                            SelectionDAG &DAG) const {
764  switch (N->getOpcode()) {
765  default:
766    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
767    return;
768  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
769    return;
770  case ISD::LOAD: {
771    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
772    Results.push_back(SDValue(Node, 0));
773    Results.push_back(SDValue(Node, 1));
774    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
775    // function
776    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
777    return;
778  }
779  case ISD::STORE:
780    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
781    Results.push_back(SDValue(Node, 0));
782    return;
783  }
784}
785
786SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
787  // On hw >= R700, the COS/SIN input must be between -1.0 and 1.0.
788  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
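  // 0.15915494309 below is 1/(2*Pi); FRACT(x / (2*Pi) + 0.5) - 0.5 wraps the
  // scaled angle into [-0.5, 0.5), which satisfies the constraint above.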
789  EVT VT = Op.getValueType();
790  SDValue Arg = Op.getOperand(0);
791  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
792      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
793        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
794          DAG.getConstantFP(0.15915494309, MVT::f32)),
795        DAG.getConstantFP(0.5, MVT::f32)));
796  unsigned TrigNode;
797  switch (Op.getOpcode()) {
798  case ISD::FCOS:
799    TrigNode = AMDGPUISD::COS_HW;
800    break;
801  case ISD::FSIN:
802    TrigNode = AMDGPUISD::SIN_HW;
803    break;
804  default:
805    llvm_unreachable("Wrong trig opcode");
806  }
807  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
808      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
809        DAG.getConstantFP(-0.5, MVT::f32)));
810  if (Gen >= AMDGPUSubtarget::R700)
811    return TrigVal;
812  // On R600 hw, COS/SIN input must be between -Pi and Pi.
813  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
814      DAG.getConstantFP(3.14159265359, MVT::f32));
815}
816
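// FP_TO_UINT producing an i1 result (marked Custom in the constructor) is
// lowered as a compare against 0.0: any non-zero float yields true.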
817SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
818  return DAG.getNode(
819      ISD::SETCC,
820      SDLoc(Op),
821      MVT::i1,
822      Op, DAG.getConstantFP(0.0f, MVT::f32),
823      DAG.getCondCode(ISD::SETNE)
824      );
825}
826
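// Implicit kernel parameters (ngroups, global size, local size) live at fixed
// dword offsets at the start of CONSTANT_BUFFER_0; load the requested dword.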
827SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
828                                                   SDLoc DL,
829                                                   unsigned DwordOffset) const {
830  unsigned ByteOffset = DwordOffset * 4;
831  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
832                                      AMDGPUAS::CONSTANT_BUFFER_0);
833
834  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
835  assert(isInt<16>(ByteOffset));
836
837  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
838                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
839                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
840                     false, false, false, 0);
841}
842
843bool R600TargetLowering::isZero(SDValue Op) const {
844  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
845    return Cst->isNullValue();
846  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
847    return CstFP->isZero();
848  } else {
849    return false;
850  }
851}
852
853SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
854  SDLoc DL(Op);
855  EVT VT = Op.getValueType();
856
857  SDValue LHS = Op.getOperand(0);
858  SDValue RHS = Op.getOperand(1);
859  SDValue True = Op.getOperand(2);
860  SDValue False = Op.getOperand(3);
861  SDValue CC = Op.getOperand(4);
862  SDValue Temp;
863
864  // LHS and RHS are guaranteed to be the same value type
865  EVT CompareVT = LHS.getValueType();
866
867  // Check if we can lower this to a native operation.
868
869  // Try to lower to a SET* instruction:
870  //
871  // SET* can match the following patterns:
872  //
873  // select_cc f32, f32, -1,  0, cc_supported
874  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
875  // select_cc i32, i32, -1,  0, cc_supported
876  //
877
878  // Move hardware True/False values to the correct operand.
879  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
880  ISD::CondCode InverseCC =
881     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
882  if (isHWTrueValue(False) && isHWFalseValue(True)) {
883    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
884      std::swap(False, True);
885      CC = DAG.getCondCode(InverseCC);
886    } else {
887      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
888      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
889        std::swap(False, True);
890        std::swap(LHS, RHS);
891        CC = DAG.getCondCode(SwapInvCC);
892      }
893    }
894  }
895
896  if (isHWTrueValue(True) && isHWFalseValue(False) &&
897      (CompareVT == VT || VT == MVT::i32)) {
898    // This can be matched by a SET* instruction.
899    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
900  }
901
902  // Try to lower to a CND* instruction:
903  //
904  // CND* can match the following patterns:
905  //
906  // select_cc f32, 0.0, f32, f32, cc_supported
907  // select_cc f32, 0.0, i32, i32, cc_supported
908  // select_cc i32, 0,   f32, f32, cc_supported
909  // select_cc i32, 0,   i32, i32, cc_supported
910  //
911
912  // Try to move the zero value to the RHS
913  if (isZero(LHS)) {
914    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
915    // Try swapping the operands
916    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
917    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
918      std::swap(LHS, RHS);
919      CC = DAG.getCondCode(CCSwapped);
920    } else {
921      // Try inverting the condition and then swapping the operands
922      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
923      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
924      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
925        std::swap(True, False);
926        std::swap(LHS, RHS);
927        CC = DAG.getCondCode(CCSwapped);
928      }
929    }
930  }
931  if (isZero(RHS)) {
932    SDValue Cond = LHS;
933    SDValue Zero = RHS;
934    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
935    if (CompareVT != VT) {
936      // Bitcast True / False to the correct types. This will end up being
937      // a nop, but it allows us to define only a single pattern in the
938      // .TD files for each CND* instruction rather than needing one pattern
939      // for integer True/False and one for fp True/False.
940      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
941      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
942    }
943
944    switch (CCOpcode) {
945    case ISD::SETONE:
946    case ISD::SETUNE:
947    case ISD::SETNE:
948      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
949      Temp = True;
950      True = False;
951      False = Temp;
952      break;
953    default:
954      break;
955    }
956    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
957        Cond, Zero,
958        True, False,
959        DAG.getCondCode(CCOpcode));
960    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
961  }
962
963
964  // Possible Min/Max pattern
965  SDValue MinMax = LowerMinMax(Op, DAG);
966  if (MinMax.getNode()) {
967    return MinMax;
968  }
969
970  // If we make it this far, it means we have no native instructions to handle
971  // this SELECT_CC, so we must lower it.
972  SDValue HWTrue, HWFalse;
973
974  if (CompareVT == MVT::f32) {
975    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
976    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
977  } else if (CompareVT == MVT::i32) {
978    HWTrue = DAG.getConstant(-1, CompareVT);
979    HWFalse = DAG.getConstant(0, CompareVT);
980  }
981  else {
982    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
983  }
984
985  // Lower this unsupported SELECT_CC into a combination of two supported
986  // SELECT_CC operations.
987  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
988
989  return DAG.getNode(ISD::SELECT_CC, DL, VT,
990      Cond, HWFalse,
991      True, False,
992      DAG.getCondCode(ISD::SETNE));
993}
994
995/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
996/// convert these pointers to a register index.  Each register holds
997/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
998/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
999/// for indirect addressing.
1000SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1001                                               unsigned StackWidth,
1002                                               SelectionDAG &DAG) const {
1003  unsigned SRLPad;
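  // Each stack slot occupies StackWidth 32-bit sub-registers, so the byte
  // address is divided by 4 * StackWidth to obtain the register index.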
1004  switch(StackWidth) {
1005  case 1:
1006    SRLPad = 2;
1007    break;
1008  case 2:
1009    SRLPad = 3;
1010    break;
1011  case 4:
1012    SRLPad = 4;
1013    break;
1014  default: llvm_unreachable("Invalid stack width");
1015  }
1016
1017  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1018                     DAG.getConstant(SRLPad, MVT::i32));
1019}
1020
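// Map element ElemIdx of a private-memory access to the channel it occupies
// within a stack slot and the pointer increment needed to reach that slot,
// based on how many channels (StackWidth) each slot uses.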
1021void R600TargetLowering::getStackAddress(unsigned StackWidth,
1022                                         unsigned ElemIdx,
1023                                         unsigned &Channel,
1024                                         unsigned &PtrIncr) const {
1025  switch (StackWidth) {
1026  default:
1027  case 1:
1028    Channel = 0;
1029    if (ElemIdx > 0) {
1030      PtrIncr = 1;
1031    } else {
1032      PtrIncr = 0;
1033    }
1034    break;
1035  case 2:
1036    Channel = ElemIdx % 2;
1037    if (ElemIdx == 2) {
1038      PtrIncr = 1;
1039    } else {
1040      PtrIncr = 0;
1041    }
1042    break;
1043  case 4:
1044    Channel = ElemIdx;
1045    PtrIncr = 0;
1046    break;
1047  }
1048}
1049
1050SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1051  SDLoc DL(Op);
1052  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1053  SDValue Chain = Op.getOperand(0);
1054  SDValue Value = Op.getOperand(1);
1055  SDValue Ptr = Op.getOperand(2);
1056
1057  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1058  if (Result.getNode()) {
1059    return Result;
1060  }
1061
1062  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1063    if (StoreNode->isTruncatingStore()) {
1064      EVT VT = Value.getValueType();
1065      assert(VT.bitsLE(MVT::i32));
1066      EVT MemVT = StoreNode->getMemoryVT();
1067      SDValue MaskConstant;
1068      if (MemVT == MVT::i8) {
1069        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1070      } else {
1071        assert(MemVT == MVT::i16);
1072        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1073      }
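      // The truncating store is emitted as a masked dword store (STORE_MSKOR):
      // the value and its mask are shifted to the store's byte offset within
      // the containing dword, and the store is addressed by dword.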
1074      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1075                                      DAG.getConstant(2, MVT::i32));
1076      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1077                                      DAG.getConstant(0x00000003, VT));
1078      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1079      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1080                                   DAG.getConstant(3, VT));
1081      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1082      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1083      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1084      // vector instead.
1085      SDValue Src[4] = {
1086        ShiftedValue,
1087        DAG.getConstant(0, MVT::i32),
1088        DAG.getConstant(0, MVT::i32),
1089        Mask
1090      };
1091      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
1092      SDValue Args[3] = { Chain, Input, DWordAddr };
1093      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1094                                     Op->getVTList(), Args, 3, MemVT,
1095                                     StoreNode->getMemOperand());
1096    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1097               Value.getValueType().bitsGE(MVT::i32)) {
1098      // Convert pointer from byte address to dword address.
1099      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1100                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1101                                    Ptr, DAG.getConstant(2, MVT::i32)));
1102
1103      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1104        llvm_unreachable("Truncated and indexed stores not supported yet");
1105      } else {
1106        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1107      }
1108      return Chain;
1109    }
1110  }
1111
1112  EVT ValueVT = Value.getValueType();
1113
1114  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1115    return SDValue();
1116  }
1117
1118  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1119  if (Ret.getNode()) {
1120    return Ret;
1121  }
1122  // Lowering for indirect addressing
1123
1124  const MachineFunction &MF = DAG.getMachineFunction();
1125  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1126                                         getTargetMachine().getFrameLowering());
1127  unsigned StackWidth = TFL->getStackWidth(MF);
1128
1129  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1130
1131  if (ValueVT.isVector()) {
1132    unsigned NumElemVT = ValueVT.getVectorNumElements();
1133    EVT ElemVT = ValueVT.getVectorElementType();
1134    SDValue Stores[4];
1135
1136    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1137                                      "vector width in load");
1138
1139    for (unsigned i = 0; i < NumElemVT; ++i) {
1140      unsigned Channel, PtrIncr;
1141      getStackAddress(StackWidth, i, Channel, PtrIncr);
1142      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1143                        DAG.getConstant(PtrIncr, MVT::i32));
1144      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1145                                 Value, DAG.getConstant(i, MVT::i32));
1146
1147      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1148                              Chain, Elem, Ptr,
1149                              DAG.getTargetConstant(Channel, MVT::i32));
1150    }
1151    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1152  } else {
1153    if (ValueVT == MVT::i8) {
1154      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1155    }
1156    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1157                        DAG.getTargetConstant(0, MVT::i32)); // Channel
1158  }
1159
1160  return Chain;
1161}
1162
1163// Return 512 + (kc_bank << 12) for a constant buffer address space, or -1 otherwise.
1164static int
1165ConstantAddressBlock(unsigned AddressSpace) {
1166  switch (AddressSpace) {
1167  case AMDGPUAS::CONSTANT_BUFFER_0:
1168    return 512;
1169  case AMDGPUAS::CONSTANT_BUFFER_1:
1170    return 512 + 4096;
1171  case AMDGPUAS::CONSTANT_BUFFER_2:
1172    return 512 + 4096 * 2;
1173  case AMDGPUAS::CONSTANT_BUFFER_3:
1174    return 512 + 4096 * 3;
1175  case AMDGPUAS::CONSTANT_BUFFER_4:
1176    return 512 + 4096 * 4;
1177  case AMDGPUAS::CONSTANT_BUFFER_5:
1178    return 512 + 4096 * 5;
1179  case AMDGPUAS::CONSTANT_BUFFER_6:
1180    return 512 + 4096 * 6;
1181  case AMDGPUAS::CONSTANT_BUFFER_7:
1182    return 512 + 4096 * 7;
1183  case AMDGPUAS::CONSTANT_BUFFER_8:
1184    return 512 + 4096 * 8;
1185  case AMDGPUAS::CONSTANT_BUFFER_9:
1186    return 512 + 4096 * 9;
1187  case AMDGPUAS::CONSTANT_BUFFER_10:
1188    return 512 + 4096 * 10;
1189  case AMDGPUAS::CONSTANT_BUFFER_11:
1190    return 512 + 4096 * 11;
1191  case AMDGPUAS::CONSTANT_BUFFER_12:
1192    return 512 + 4096 * 12;
1193  case AMDGPUAS::CONSTANT_BUFFER_13:
1194    return 512 + 4096 * 13;
1195  case AMDGPUAS::CONSTANT_BUFFER_14:
1196    return 512 + 4096 * 14;
1197  case AMDGPUAS::CONSTANT_BUFFER_15:
1198    return 512 + 4096 * 15;
1199  default:
1200    return -1;
1201  }
1202}
1203
1204SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1205{
1206  EVT VT = Op.getValueType();
1207  SDLoc DL(Op);
1208  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1209  SDValue Chain = Op.getOperand(0);
1210  SDValue Ptr = Op.getOperand(1);
1211  SDValue LoweredLoad;
1212
1213  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1214  if (Ret.getNode()) {
1215    SDValue Ops[2];
1216    Ops[0] = Ret;
1217    Ops[1] = Chain;
1218    return DAG.getMergeValues(Ops, 2, DL);
1219  }
1220
1221
1222  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1223    SDValue MergedValues[2] = {
1224      SplitVectorLoad(Op, DAG),
1225      Chain
1226    };
1227    return DAG.getMergeValues(MergedValues, 2, DL);
1228  }
1229
1230  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1231  if (ConstantBlock > -1 &&
1232      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1233       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1234    SDValue Result;
1235    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
1236        isa<Constant>(LoadNode->getSrcValue()) ||
1237        isa<ConstantSDNode>(Ptr)) {
1238      SDValue Slots[4];
1239      for (unsigned i = 0; i < 4; i++) {
1240        // We want the constant position encoded with the following formula:
1241        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1242        // where const_index is Ptr / 16 (llvm computes Ptr with an alignment of 16).
1243        // Thus we add (512 + (kc_bank << 12)) * 16 + chan * 4 here and
1244        // then divide by 4 at the ISel step.
1245        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1246            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1247        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1248      }
1249      EVT NewVT = MVT::v4i32;
1250      unsigned NumElements = 4;
1251      if (VT.isVector()) {
1252        NewVT = VT;
1253        NumElements = VT.getVectorNumElements();
1254      }
1255      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
1256    } else {
1257      // A non-constant Ptr can't be folded, so keep it as a v4i32 load.
1258      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1259          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1260          DAG.getConstant(LoadNode->getAddressSpace() -
1261                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1262          );
1263    }
1264
1265    if (!VT.isVector()) {
1266      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1267          DAG.getConstant(0, MVT::i32));
1268    }
1269
1270    SDValue MergedValues[2] = {
1271        Result,
1272        Chain
1273    };
1274    return DAG.getMergeValues(MergedValues, 2, DL);
1275  }
1276
1277  // For most operations returning SDValue() will result in the node being
1278  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1279  // need to manually expand loads that may be legal in some address spaces and
1280  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1281  // compute shaders, since the data is sign extended when it is uploaded to the
1282  // buffer. However SEXT loads from other address spaces are not supported, so
1283  // we need to expand them here.
1284  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1285    EVT MemVT = LoadNode->getMemoryVT();
1286    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1287    SDValue ShiftAmount =
1288          DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1289    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1290                                  LoadNode->getPointerInfo(), MemVT,
1291                                  LoadNode->isVolatile(),
1292                                  LoadNode->isNonTemporal(),
1293                                  LoadNode->getAlignment());
1294    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1295    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1296
1297    SDValue MergedValues[2] = { Sra, Chain };
1298    return DAG.getMergeValues(MergedValues, 2, DL);
1299  }
1300
1301  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1302    return SDValue();
1303  }
1304
1305  // Lowering for indirect addressing
1306  const MachineFunction &MF = DAG.getMachineFunction();
1307  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1308                                         getTargetMachine().getFrameLowering());
1309  unsigned StackWidth = TFL->getStackWidth(MF);
1310
1311  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1312
1313  if (VT.isVector()) {
1314    unsigned NumElemVT = VT.getVectorNumElements();
1315    EVT ElemVT = VT.getVectorElementType();
1316    SDValue Loads[4];
1317
1318    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1319                                      "vector width in load");
1320
1321    for (unsigned i = 0; i < NumElemVT; ++i) {
1322      unsigned Channel, PtrIncr;
1323      getStackAddress(StackWidth, i, Channel, PtrIncr);
1324      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1325                        DAG.getConstant(PtrIncr, MVT::i32));
1326      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1327                             Chain, Ptr,
1328                             DAG.getTargetConstant(Channel, MVT::i32),
1329                             Op.getOperand(2));
1330    }
1331    for (unsigned i = NumElemVT; i < 4; ++i) {
1332      Loads[i] = DAG.getUNDEF(ElemVT);
1333    }
1334    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1335    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1336  } else {
1337    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1338                              Chain, Ptr,
1339                              DAG.getTargetConstant(0, MVT::i32), // Channel
1340                              Op.getOperand(2));
1341  }
1342
1343  SDValue Ops[2];
1344  Ops[0] = LoweredLoad;
1345  Ops[1] = Chain;
1346
1347  return DAG.getMergeValues(Ops, 2, DL);
1348}
1349
1350/// XXX Only kernel functions are supported, so we can assume for now that
1351/// every function is a kernel function, but in the future we should use
1352/// separate calling conventions for kernel and non-kernel functions.
1353SDValue R600TargetLowering::LowerFormalArguments(
1354                                      SDValue Chain,
1355                                      CallingConv::ID CallConv,
1356                                      bool isVarArg,
1357                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1358                                      SDLoc DL, SelectionDAG &DAG,
1359                                      SmallVectorImpl<SDValue> &InVals) const {
1360  SmallVector<CCValAssign, 16> ArgLocs;
1361  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1362                 getTargetMachine(), ArgLocs, *DAG.getContext());
1363  MachineFunction &MF = DAG.getMachineFunction();
1364  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
1365
1366  SmallVector<ISD::InputArg, 8> LocalIns;
1367
1368  getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
1369                          LocalIns);
1370
1371  AnalyzeFormalArguments(CCInfo, LocalIns);
1372
1373  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1374    CCValAssign &VA = ArgLocs[i];
1375    EVT VT = Ins[i].VT;
1376    EVT MemVT = LocalIns[i].VT;
1377
1378    if (ShaderType != ShaderType::COMPUTE) {
1379      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1380      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1381      InVals.push_back(Register);
1382      continue;
1383    }
1384
1385    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1386                                                   AMDGPUAS::CONSTANT_BUFFER_0);
1387
1388    // i64 isn't a legal type, so the register type used ends up as i32, which
1389    // isn't expected here. It attempts to create this sextload, but it ends up
1390    // being invalid. Somehow this seems to work with i64 arguments, but breaks
1391    // for <1 x i64>.
1392
1393    // The first 36 bytes of the input buffer contain information about
1394    // thread group and global sizes.
1395    SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
1396                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1397                                 MachinePointerInfo(UndefValue::get(PtrTy)),
1398                                 MemVT, false, false, 4);
1399    // 4 is the preferred alignment for
1400    // the CONSTANT memory space.
1401    InVals.push_back(Arg);
1402  }
1403  return Chain;
1404}
1405
1406EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1407   if (!VT.isVector()) return MVT::i32;
1408   return VT.changeVectorElementTypeToInteger();
1409}
1410
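// Fold BUILD_VECTOR operands that can be expressed as a swizzle selector
// instead: undef becomes SEL_MASK_WRITE, constant 0.0 becomes SEL_0, constant
// 1.0 becomes SEL_1, and an element equal to an earlier one reuses that lane.
// The old -> new selector mapping is recorded in RemapSwizzle and the folded
// elements are replaced by undef in the returned BUILD_VECTOR.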
1411static SDValue
1412CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
1413                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
1414  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1415  assert(RemapSwizzle.empty());
1416  SDValue NewBldVec[4] = {
1417      VectorEntry.getOperand(0),
1418      VectorEntry.getOperand(1),
1419      VectorEntry.getOperand(2),
1420      VectorEntry.getOperand(3)
1421  };
1422
1423  for (unsigned i = 0; i < 4; i++) {
1424    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1425      // We mask the write here to teach later passes that the ith element of this
1426      // vector is undef. Thus we can use it to reduce 128-bit register usage,
1427      // break false dependencies, and additionally make the assembly easier to read.
1428      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1429    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1430      if (C->isZero()) {
1431        RemapSwizzle[i] = 4; // SEL_0
1432        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1433      } else if (C->isExactlyValue(1.0)) {
1434        RemapSwizzle[i] = 5; // SEL_1
1435        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1436      }
1437    }
1438
1439    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1440      continue;
1441    for (unsigned j = 0; j < i; j++) {
1442      if (NewBldVec[i] == NewBldVec[j]) {
1443        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1444        RemapSwizzle[i] = j;
1445        break;
1446      }
1447    }
1448  }
1449
1450  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1451      VectorEntry.getValueType(), NewBldVec, 4);
1452}
1453
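/// \brief Try to move each EXTRACT_VECTOR_ELT operand of a BUILD_VECTOR into
/// the lane matching the element it extracts, so the resulting swizzle is
/// closer to the identity. Lanes already in place are pinned; at most one
/// swap is performed per call and it is recorded in RemapSwizzle.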
1454static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1455                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
1456  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1457  assert(RemapSwizzle.empty());
1458  SDValue NewBldVec[4] = {
1459      VectorEntry.getOperand(0),
1460      VectorEntry.getOperand(1),
1461      VectorEntry.getOperand(2),
1462      VectorEntry.getOperand(3)
1463  };
1464  bool isUnmovable[4] = { false, false, false, false };
1465  for (unsigned i = 0; i < 4; i++) {
1466    RemapSwizzle[i] = i;
1467    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1468      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1469          ->getZExtValue();
1470      if (i == Idx)
1471        isUnmovable[Idx] = true;
1472    }
1473  }
1474
1475  for (unsigned i = 0; i < 4; i++) {
1476    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1477      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1478          ->getZExtValue();
1479      if (isUnmovable[Idx])
1480        continue;
1481      // Swap i and Idx
1482      std::swap(NewBldVec[Idx], NewBldVec[i]);
1483      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1484      break;
1485    }
1486  }
1487
1488  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1489      VectorEntry.getValueType(), NewBldVec, 4);
1490}
1491
1492
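/// \brief Run CompactSwizzlableVector and ReorganizeVector on BuildVector and
/// rewrite the four swizzle selector constants in Swz accordingly, so users
/// such as EXPORT and TEXTURE_FETCH still read the intended components.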
1493SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
1494                                            SelectionDAG &DAG) const {
1495  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1496  // Old -> New swizzle values
1497  DenseMap<unsigned, unsigned> SwizzleRemap;
1498
1499  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1500  for (unsigned i = 0; i < 4; i++) {
1501    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1502    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1503      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1504  }
1505
1506  SwizzleRemap.clear();
1507  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1508  for (unsigned i = 0; i < 4; i++) {
1509    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1510    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1511      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1512  }
1513
1514  return BuildVector;
1515}
1516
1517
1518//===----------------------------------------------------------------------===//
1519// Custom DAG Optimizations
1520//===----------------------------------------------------------------------===//
1521
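/// \brief Target-specific DAG combines; see the individual cases below for
/// the patterns handled (FP_ROUND of UINT_TO_FP, FP_TO_SINT of a negated
/// select_cc, INSERT/EXTRACT_VECTOR_ELT of BUILD_VECTOR, nested SELECT_CC,
/// and swizzle optimization for EXPORT and TEXTURE_FETCH).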
1522SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1523                                              DAGCombinerInfo &DCI) const {
1524  SelectionDAG &DAG = DCI.DAG;
1525
1526  switch (N->getOpcode()) {
1527  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1528  case ISD::FP_ROUND: {
1529      SDValue Arg = N->getOperand(0);
1530      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1531        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1532                           Arg.getOperand(0));
1533      }
1534      break;
1535    }
1536
1537  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1538  // (i32 select_cc f32, f32, -1, 0 cc)
1539  //
1540  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1541  // this to one of the SET*_DX10 instructions.
1542  case ISD::FP_TO_SINT: {
1543    SDValue FNeg = N->getOperand(0);
1544    if (FNeg.getOpcode() != ISD::FNEG) {
1545      return SDValue();
1546    }
1547    SDValue SelectCC = FNeg.getOperand(0);
1548    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1549        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1550        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1551        !isHWTrueValue(SelectCC.getOperand(2)) ||
1552        !isHWFalseValue(SelectCC.getOperand(3))) {
1553      return SDValue();
1554    }
1555
1556    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1557                           SelectCC.getOperand(0), // LHS
1558                           SelectCC.getOperand(1), // RHS
1559                           DAG.getConstant(-1, MVT::i32), // True
1560                           DAG.getConstant(0, MVT::i32),  // False
1561                           SelectCC.getOperand(4)); // CC
1562
1563    break;
1564  }
1565
1566  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1567  // => build_vector elt0, ... , NewEltIdx, ... , eltN
1568  case ISD::INSERT_VECTOR_ELT: {
1569    SDValue InVec = N->getOperand(0);
1570    SDValue InVal = N->getOperand(1);
1571    SDValue EltNo = N->getOperand(2);
1572    SDLoc dl(N);
1573
1574    // If the inserted element is an UNDEF, just use the input vector.
1575    if (InVal.getOpcode() == ISD::UNDEF)
1576      return InVec;
1577
1578    EVT VT = InVec.getValueType();
1579
1580    // If we can't generate a legal BUILD_VECTOR, exit
1581    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1582      return SDValue();
1583
1584    // Check that we know which element is being inserted
1585    if (!isa<ConstantSDNode>(EltNo))
1586      return SDValue();
1587    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1588
1589    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1590    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1591    // vector elements.
1592    SmallVector<SDValue, 8> Ops;
1593    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1594      Ops.append(InVec.getNode()->op_begin(),
1595                 InVec.getNode()->op_end());
1596    } else if (InVec.getOpcode() == ISD::UNDEF) {
1597      unsigned NElts = VT.getVectorNumElements();
1598      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1599    } else {
1600      return SDValue();
1601    }
1602
1603    // Insert the element
1604    if (Elt < Ops.size()) {
1605      // All the operands of BUILD_VECTOR must have the same type;
1606      // we enforce that here.
1607      EVT OpVT = Ops[0].getValueType();
1608      if (InVal.getValueType() != OpVT)
1609        InVal = OpVT.bitsGT(InVal.getValueType()) ?
1610          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1611          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1612      Ops[Elt] = InVal;
1613    }
1614
1615    // Return the new vector
1616    return DAG.getNode(ISD::BUILD_VECTOR, dl,
1617                       VT, &Ops[0], Ops.size());
1618  }
1619
1620  // An extract_vector_elt of a build_vector generated by custom lowering
1621  // also needs to be combined here.
1622  case ISD::EXTRACT_VECTOR_ELT: {
1623    SDValue Arg = N->getOperand(0);
1624    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1625      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1626        unsigned Element = Const->getZExtValue();
1627        return Arg->getOperand(Element);
1628      }
1629    }
1630    if (Arg.getOpcode() == ISD::BITCAST &&
1631        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1632      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1633        unsigned Element = Const->getZExtValue();
1634        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1635            Arg->getOperand(0).getOperand(Element));
1636      }
1637    }
1638  }
1639
1640  case ISD::SELECT_CC: {
1641    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1642    //      selectcc x, y, a, b, inv(cc)
1643    //
1644    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1645    //      selectcc x, y, a, b, cc
1646    SDValue LHS = N->getOperand(0);
1647    if (LHS.getOpcode() != ISD::SELECT_CC) {
1648      return SDValue();
1649    }
1650
1651    SDValue RHS = N->getOperand(1);
1652    SDValue True = N->getOperand(2);
1653    SDValue False = N->getOperand(3);
1654    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1655
1656    if (LHS.getOperand(2).getNode() != True.getNode() ||
1657        LHS.getOperand(3).getNode() != False.getNode() ||
1658        RHS.getNode() != False.getNode()) {
1659      return SDValue();
1660    }
1661
1662    switch (NCC) {
1663    default: return SDValue();
1664    case ISD::SETNE: return LHS;
1665    case ISD::SETEQ: {
1666      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1667      LHSCC = ISD::getSetCCInverse(LHSCC,
1668                                  LHS.getOperand(0).getValueType().isInteger());
1669      if (DCI.isBeforeLegalizeOps() ||
1670          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1671        return DAG.getSelectCC(SDLoc(N),
1672                               LHS.getOperand(0),
1673                               LHS.getOperand(1),
1674                               LHS.getOperand(2),
1675                               LHS.getOperand(3),
1676                               LHSCC);
1677      break;
1678    }
1679    }
1680    return SDValue();
1681  }
1682
1683  case AMDGPUISD::EXPORT: {
1684    SDValue Arg = N->getOperand(1);
1685    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1686      break;
1687
1688    SDValue NewArgs[8] = {
1689      N->getOperand(0), // Chain
1690      SDValue(),
1691      N->getOperand(2), // ArrayBase
1692      N->getOperand(3), // Type
1693      N->getOperand(4), // SWZ_X
1694      N->getOperand(5), // SWZ_Y
1695      N->getOperand(6), // SWZ_Z
1696      N->getOperand(7) // SWZ_W
1697    };
1698    SDLoc DL(N);
1699    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1700    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1701  }
1702  case AMDGPUISD::TEXTURE_FETCH: {
1703    SDValue Arg = N->getOperand(1);
1704    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1705      break;
1706
1707    SDValue NewArgs[19] = {
1708      N->getOperand(0),
1709      N->getOperand(1),
1710      N->getOperand(2),
1711      N->getOperand(3),
1712      N->getOperand(4),
1713      N->getOperand(5),
1714      N->getOperand(6),
1715      N->getOperand(7),
1716      N->getOperand(8),
1717      N->getOperand(9),
1718      N->getOperand(10),
1719      N->getOperand(11),
1720      N->getOperand(12),
1721      N->getOperand(13),
1722      N->getOperand(14),
1723      N->getOperand(15),
1724      N->getOperand(16),
1725      N->getOperand(17),
1726      N->getOperand(18),
1727    };
1728    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1729    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1730        NewArgs, 19);
1731  }
1732  }
1733  return SDValue();
1734}
1735
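/// \brief Try to fold the node feeding operand SrcIdx of ParentNode directly
/// into the instruction: FNEG/FABS become the neg/abs modifiers, CONST_COPY
/// becomes an ALU_CONST read (subject to the constant read limitations), and
/// MOV_IMM_* becomes an inline constant register or the ALU_LITERAL_X slot.
/// On success the Src/Neg/Abs/Sel/Imm references are updated and true is
/// returned.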
1736static bool
1737FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
1738            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
1739  const R600InstrInfo *TII =
1740      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1741  if (!Src.isMachineOpcode())
1742    return false;
1743  switch (Src.getMachineOpcode()) {
1744  case AMDGPU::FNEG_R600:
1745    if (!Neg.getNode())
1746      return false;
1747    Src = Src.getOperand(0);
1748    Neg = DAG.getTargetConstant(1, MVT::i32);
1749    return true;
1750  case AMDGPU::FABS_R600:
1751    if (!Abs.getNode())
1752      return false;
1753    Src = Src.getOperand(0);
1754    Abs = DAG.getTargetConstant(1, MVT::i32);
1755    return true;
1756  case AMDGPU::CONST_COPY: {
1757    unsigned Opcode = ParentNode->getMachineOpcode();
1758    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1759
1760    if (!Sel.getNode())
1761      return false;
1762
1763    SDValue CstOffset = Src.getOperand(0);
1764    if (ParentNode->getValueType(0).isVector())
1765      return false;
1766
1767    // Gather the constant values already used by the parent instruction.
1768    int SrcIndices[] = {
1769      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1770      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1771      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
1772      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1773      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1774      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1775      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1776      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1777      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1778      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1779      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1780    };
1781    std::vector<unsigned> Consts;
1782    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
1783      int OtherSrcIdx = SrcIndices[i];
1784      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1785      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
1786        continue;
1787      if (HasDst) {
1788        OtherSrcIdx--;
1789        OtherSelIdx--;
1790      }
1791      if (RegisterSDNode *Reg =
1792          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
1793        if (Reg->getReg() == AMDGPU::ALU_CONST) {
1794          ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
1795              ParentNode->getOperand(OtherSelIdx));
1796          Consts.push_back(Cst->getZExtValue());
1797        }
1798      }
1799    }
1800
1801    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
1802    Consts.push_back(Cst->getZExtValue());
1803    if (!TII->fitsConstReadLimitations(Consts)) {
1804      return false;
1805    }
1806
1807    Sel = CstOffset;
1808    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
1809    return true;
1810  }
1811  case AMDGPU::MOV_IMM_I32:
1812  case AMDGPU::MOV_IMM_F32: {
1813    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
1814    uint64_t ImmValue = 0;
1815
1816
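    // Use a hardware inline constant register when the immediate matches one
    // (ZERO, HALF and ONE for floats; ZERO and ONE_INT for integers);
    // otherwise keep ALU_LITERAL_X and remember the raw bits in ImmValue.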
1817    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
1818      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
1819      float FloatValue = FPC->getValueAPF().convertToFloat();
1820      if (FloatValue == 0.0) {
1821        ImmReg = AMDGPU::ZERO;
1822      } else if (FloatValue == 0.5) {
1823        ImmReg = AMDGPU::HALF;
1824      } else if (FloatValue == 1.0) {
1825        ImmReg = AMDGPU::ONE;
1826      } else {
1827        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
1828      }
1829    } else {
1830      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
1831      uint64_t Value = C->getZExtValue();
1832      if (Value == 0) {
1833        ImmReg = AMDGPU::ZERO;
1834      } else if (Value == 1) {
1835        ImmReg = AMDGPU::ONE_INT;
1836      } else {
1837        ImmValue = Value;
1838      }
1839    }
1840
1841    // Check that we aren't already using an immediate.
1842    // XXX: It's possible for an instruction to have more than one
1843    // immediate operand, but this is not supported yet.
1844    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
1845      if (!Imm.getNode())
1846        return false;
1847      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
1848      assert(C);
1849      if (C->getZExtValue())
1850        return false;
1851      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
1852    }
1853    Src = DAG.getRegister(ImmReg, MVT::i32);
1854    return true;
1855  }
1856  default:
1857    return false;
1858  }
1859}
1860
1861
1862/// \brief Fold the instructions after selecting them
1863SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
1864                                            SelectionDAG &DAG) const {
1865  const R600InstrInfo *TII =
1866      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1867  if (!Node->isMachineOpcode())
1868    return Node;
1869  unsigned Opcode = Node->getMachineOpcode();
1870  SDValue FakeOp;
1871
1872  std::vector<SDValue> Ops;
1873  for (SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
1874       I != E; ++I)
1875    Ops.push_back(*I);
1876
1877  if (Opcode == AMDGPU::DOT_4) {
1878    int OperandIdx[] = {
1879      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1880      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1881      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1882      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1883      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1884      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1885      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1886      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1887    };
1888    int NegIdx[] = {
1889      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
1890      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
1891      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
1892      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
1893      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
1894      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
1895      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
1896      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
1897    };
1898    int AbsIdx[] = {
1899      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
1900      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
1901      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
1902      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
1903      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
1904      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
1905      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
1906      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
1907    };
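    // Note: getOperandIdx() indices count the explicit dst operand of the
    // MachineInstr form, while the SDNode operand list copied into Ops does
    // not, hence the -1 / HasDst adjustments below.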
1908    for (unsigned i = 0; i < 8; i++) {
1909      if (OperandIdx[i] < 0)
1910        return Node;
1911      SDValue &Src = Ops[OperandIdx[i] - 1];
1912      SDValue &Neg = Ops[NegIdx[i] - 1];
1913      SDValue &Abs = Ops[AbsIdx[i] - 1];
1914      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1915      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1916      if (HasDst)
1917        SelIdx--;
1918      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1919      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
1920        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1921    }
1922  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
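    // For REG_SEQUENCE only the value operands (every second operand,
    // starting at index 1) are candidates for folding.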
1923    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
1924      SDValue &Src = Ops[i];
1925      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
1926        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1927    }
1928  } else if (Opcode == AMDGPU::CLAMP_R600) {
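    // CLAMP_R600 is folded away by setting the clamp bit on the instruction
    // that produces its source, provided that instruction supports output
    // modifiers.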
1929    SDValue Src = Node->getOperand(0);
1930    if (!Src.isMachineOpcode() ||
1931        !TII->hasInstrModifiers(Src.getMachineOpcode()))
1932      return Node;
1933    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
1934        AMDGPU::OpName::clamp);
1935    if (ClampIdx < 0)
1936      return Node;
1937    std::vector<SDValue> Ops;
1938    unsigned NumOp = Src.getNumOperands();
1939    for (unsigned i = 0; i < NumOp; ++i)
1940      Ops.push_back(Src.getOperand(i));
1941    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
1942    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
1943        Node->getVTList(), Ops);
1944  } else {
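    // Generic ALU instruction with modifiers: try to fold neg/abs, constant
    // selects and literals into each of its (up to three) sources.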
1945    if (!TII->hasInstrModifiers(Opcode))
1946      return Node;
1947    int OperandIdx[] = {
1948      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1949      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1950      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
1951    };
1952    int NegIdx[] = {
1953      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
1954      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
1955      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
1956    };
1957    int AbsIdx[] = {
1958      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
1959      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
1960      -1
1961    };
1962    for (unsigned i = 0; i < 3; i++) {
1963      if (OperandIdx[i] < 0)
1964        return Node;
1965      SDValue &Src = Ops[OperandIdx[i] - 1];
1966      SDValue &Neg = Ops[NegIdx[i] - 1];
1967      SDValue FakeAbs;
1968      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
1969      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1970      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1971      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
1972      if (HasDst) {
1973        SelIdx--;
1974        ImmIdx--;
1975      }
1976      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1977      SDValue &Imm = Ops[ImmIdx];
1978      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
1979        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1980    }
1981  }
1982
1983  return Node;
1984}
1985