R600ISelLowering.cpp revision 29f1788de96cbf88ab87e3da130cf626b2e8e029
1//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for R600
12//
13//===----------------------------------------------------------------------===//
14
15#include "R600ISelLowering.h"
16#include "R600Defines.h"
17#include "R600InstrInfo.h"
18#include "R600MachineFunctionInfo.h"
19#include "llvm/CodeGen/CallingConvLower.h"
20#include "llvm/CodeGen/MachineFrameInfo.h"
21#include "llvm/CodeGen/MachineInstrBuilder.h"
22#include "llvm/CodeGen/MachineRegisterInfo.h"
23#include "llvm/CodeGen/SelectionDAG.h"
24#include "llvm/IR/Argument.h"
25#include "llvm/IR/Function.h"
26
27using namespace llvm;
28
29R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
30    AMDGPUTargetLowering(TM),
31    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
32  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
33  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
34  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
35  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
36  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
37  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
38
39  computeRegisterProperties();
40
41  // Set condition code actions
42  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
43  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
44  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
45  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
46  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
47  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
48  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
49  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
50  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
51  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
52  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
53  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
54
55  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
56  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
57  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
58  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
59
60  setOperationAction(ISD::FCOS, MVT::f32, Custom);
61  setOperationAction(ISD::FSIN, MVT::f32, Custom);
62
63  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
64  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
65
66  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
67  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
68
69  setOperationAction(ISD::FSUB, MVT::f32, Expand);
70
71  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
72  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
73  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
74
75  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
76  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
77
78  setOperationAction(ISD::SETCC, MVT::i32, Expand);
79  setOperationAction(ISD::SETCC, MVT::f32, Expand);
80  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
81
82  setOperationAction(ISD::SELECT, MVT::i32, Expand);
83  setOperationAction(ISD::SELECT, MVT::f32, Expand);
84  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
85  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
86  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
87  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
88
89  // Legalize loads and stores to the private address space.
90  setOperationAction(ISD::LOAD, MVT::i32, Custom);
91  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
92  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
93
94  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
95  // spaces, so it is custom lowered to handle those where it isn't.
96  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
97  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
98  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
99  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
100  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
101  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
102
103  setOperationAction(ISD::STORE, MVT::i8, Custom);
104  setOperationAction(ISD::STORE, MVT::i32, Custom);
105  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
106  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
107  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
108  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
109
110  setOperationAction(ISD::LOAD, MVT::i32, Custom);
111  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
112  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
113
114  setTargetDAGCombine(ISD::FP_ROUND);
115  setTargetDAGCombine(ISD::FP_TO_SINT);
116  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
117  setTargetDAGCombine(ISD::SELECT_CC);
118  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
119
120  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
121
122  setBooleanContents(ZeroOrNegativeOneBooleanContent);
123  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
124  setSchedulingPreference(Sched::Source);
125}
126
127MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
128    MachineInstr * MI, MachineBasicBlock * BB) const {
129  MachineFunction * MF = BB->getParent();
130  MachineRegisterInfo &MRI = MF->getRegInfo();
131  MachineBasicBlock::iterator I = *MI;
132  const R600InstrInfo *TII =
133    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
134
135  switch (MI->getOpcode()) {
136  default:
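    // LDS instructions return their result through the OQAP register. If the
    // destination register is actually used, emit the instruction with OQAP as
    // its def and copy the value out with a MOV; otherwise switch to the
    // no-return variant of the opcode and drop the def entirely.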
137    if (TII->isLDSInstr(MI->getOpcode()) &&
138        TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst) != -1) {
139      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
140      assert(DstIdx != -1);
141      MachineInstrBuilder NewMI;
142      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) {
143        NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()),
144                        AMDGPU::OQAP);
145        TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
146                                     MI->getOperand(0).getReg(),
147                                     AMDGPU::OQAP);
148      } else {
149        NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
150                        TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
151      }
152      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
153        NewMI.addOperand(MI->getOperand(i));
154      }
155    } else {
156      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
157    }
158    break;
159  case AMDGPU::CLAMP_R600: {
160    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
161                                                   AMDGPU::MOV,
162                                                   MI->getOperand(0).getReg(),
163                                                   MI->getOperand(1).getReg());
164    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
165    break;
166  }
167
168  case AMDGPU::FABS_R600: {
169    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
170                                                    AMDGPU::MOV,
171                                                    MI->getOperand(0).getReg(),
172                                                    MI->getOperand(1).getReg());
173    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
174    break;
175  }
176
177  case AMDGPU::FNEG_R600: {
178    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
179                                                    AMDGPU::MOV,
180                                                    MI->getOperand(0).getReg(),
181                                                    MI->getOperand(1).getReg());
182    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
183    break;
184  }
185
186  case AMDGPU::MASK_WRITE: {
187    unsigned maskedRegister = MI->getOperand(0).getReg();
188    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
189    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
190    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
191    break;
192  }
193
194  case AMDGPU::MOV_IMM_F32:
195    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
196                     MI->getOperand(1).getFPImm()->getValueAPF()
197                         .bitcastToAPInt().getZExtValue());
198    break;
199  case AMDGPU::MOV_IMM_I32:
200    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
201                     MI->getOperand(1).getImm());
202    break;
203  case AMDGPU::CONST_COPY: {
204    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
205        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
206    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
207        MI->getOperand(1).getImm());
208    break;
209  }
210
211  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
212  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
213  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
214    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
215
216    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
217            .addOperand(MI->getOperand(0))
218            .addOperand(MI->getOperand(1))
219            .addImm(EOP); // Set End of program bit
220    break;
221  }
222
223  case AMDGPU::TXD: {
224    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
225    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
226    MachineOperand &RID = MI->getOperand(4);
227    MachineOperand &SID = MI->getOperand(5);
228    unsigned TextureId = MI->getOperand(6).getImm();
229    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
230    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
231
232    switch (TextureId) {
233    case 5: // Rect
234      CTX = CTY = 0;
235      break;
236    case 6: // Shadow1D
237      SrcW = SrcZ;
238      break;
239    case 7: // Shadow2D
240      SrcW = SrcZ;
241      break;
242    case 8: // ShadowRect
243      CTX = CTY = 0;
244      SrcW = SrcZ;
245      break;
246    case 9: // 1DArray
247      SrcZ = SrcY;
248      CTZ = 0;
249      break;
250    case 10: // 2DArray
251      CTZ = 0;
252      break;
253    case 11: // Shadow1DArray
254      SrcZ = SrcY;
255      CTZ = 0;
256      break;
257    case 12: // Shadow2DArray
258      CTZ = 0;
259      break;
260    }
261    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
262            .addOperand(MI->getOperand(3))
263            .addImm(SrcX)
264            .addImm(SrcY)
265            .addImm(SrcZ)
266            .addImm(SrcW)
267            .addImm(0)
268            .addImm(0)
269            .addImm(0)
270            .addImm(0)
271            .addImm(1)
272            .addImm(2)
273            .addImm(3)
274            .addOperand(RID)
275            .addOperand(SID)
276            .addImm(CTX)
277            .addImm(CTY)
278            .addImm(CTZ)
279            .addImm(CTW);
280    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
281            .addOperand(MI->getOperand(2))
282            .addImm(SrcX)
283            .addImm(SrcY)
284            .addImm(SrcZ)
285            .addImm(SrcW)
286            .addImm(0)
287            .addImm(0)
288            .addImm(0)
289            .addImm(0)
290            .addImm(1)
291            .addImm(2)
292            .addImm(3)
293            .addOperand(RID)
294            .addOperand(SID)
295            .addImm(CTX)
296            .addImm(CTY)
297            .addImm(CTZ)
298            .addImm(CTW);
299    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
300            .addOperand(MI->getOperand(0))
301            .addOperand(MI->getOperand(1))
302            .addImm(SrcX)
303            .addImm(SrcY)
304            .addImm(SrcZ)
305            .addImm(SrcW)
306            .addImm(0)
307            .addImm(0)
308            .addImm(0)
309            .addImm(0)
310            .addImm(1)
311            .addImm(2)
312            .addImm(3)
313            .addOperand(RID)
314            .addOperand(SID)
315            .addImm(CTX)
316            .addImm(CTY)
317            .addImm(CTZ)
318            .addImm(CTW)
319            .addReg(T0, RegState::Implicit)
320            .addReg(T1, RegState::Implicit);
321    break;
322  }
323
324  case AMDGPU::TXD_SHADOW: {
325    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
326    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
327    MachineOperand &RID = MI->getOperand(4);
328    MachineOperand &SID = MI->getOperand(5);
329    unsigned TextureId = MI->getOperand(6).getImm();
330    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
331    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
332
333    switch (TextureId) {
334    case 5: // Rect
335      CTX = CTY = 0;
336      break;
337    case 6: // Shadow1D
338      SrcW = SrcZ;
339      break;
340    case 7: // Shadow2D
341      SrcW = SrcZ;
342      break;
343    case 8: // ShadowRect
344      CTX = CTY = 0;
345      SrcW = SrcZ;
346      break;
347    case 9: // 1DArray
348      SrcZ = SrcY;
349      CTZ = 0;
350      break;
351    case 10: // 2DArray
352      CTZ = 0;
353      break;
354    case 11: // Shadow1DArray
355      SrcZ = SrcY;
356      CTZ = 0;
357      break;
358    case 12: // Shadow2DArray
359      CTZ = 0;
360      break;
361    }
362
363    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
364            .addOperand(MI->getOperand(3))
365            .addImm(SrcX)
366            .addImm(SrcY)
367            .addImm(SrcZ)
368            .addImm(SrcW)
369            .addImm(0)
370            .addImm(0)
371            .addImm(0)
372            .addImm(0)
373            .addImm(1)
374            .addImm(2)
375            .addImm(3)
376            .addOperand(RID)
377            .addOperand(SID)
378            .addImm(CTX)
379            .addImm(CTY)
380            .addImm(CTZ)
381            .addImm(CTW);
382    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
383            .addOperand(MI->getOperand(2))
384            .addImm(SrcX)
385            .addImm(SrcY)
386            .addImm(SrcZ)
387            .addImm(SrcW)
388            .addImm(0)
389            .addImm(0)
390            .addImm(0)
391            .addImm(0)
392            .addImm(1)
393            .addImm(2)
394            .addImm(3)
395            .addOperand(RID)
396            .addOperand(SID)
397            .addImm(CTX)
398            .addImm(CTY)
399            .addImm(CTZ)
400            .addImm(CTW);
401    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
402            .addOperand(MI->getOperand(0))
403            .addOperand(MI->getOperand(1))
404            .addImm(SrcX)
405            .addImm(SrcY)
406            .addImm(SrcZ)
407            .addImm(SrcW)
408            .addImm(0)
409            .addImm(0)
410            .addImm(0)
411            .addImm(0)
412            .addImm(1)
413            .addImm(2)
414            .addImm(3)
415            .addOperand(RID)
416            .addOperand(SID)
417            .addImm(CTX)
418            .addImm(CTY)
419            .addImm(CTZ)
420            .addImm(CTW)
421            .addReg(T0, RegState::Implicit)
422            .addReg(T1, RegState::Implicit);
423    break;
424  }
425
426  case AMDGPU::BRANCH:
427      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
428              .addOperand(MI->getOperand(0));
429      break;
430
431  case AMDGPU::BRANCH_COND_f32: {
432    MachineInstr *NewMI =
433      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
434              AMDGPU::PREDICATE_BIT)
435              .addOperand(MI->getOperand(1))
436              .addImm(OPCODE_IS_NOT_ZERO)
437              .addImm(0); // Flags
438    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
439    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
440            .addOperand(MI->getOperand(0))
441            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
442    break;
443  }
444
445  case AMDGPU::BRANCH_COND_i32: {
446    MachineInstr *NewMI =
447      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
448            AMDGPU::PREDICATE_BIT)
449            .addOperand(MI->getOperand(1))
450            .addImm(OPCODE_IS_NOT_ZERO_INT)
451            .addImm(0); // Flags
452    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
453    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
454           .addOperand(MI->getOperand(0))
455            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
456    break;
457  }
458
459  case AMDGPU::EG_ExportSwz:
460  case AMDGPU::R600_ExportSwz: {
461    // The instruction is left unmodified if it's not the last one of its type.
462    bool isLastInstructionOfItsType = true;
463    unsigned InstExportType = MI->getOperand(1).getImm();
464    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
465         EndBlock = BB->end(); NextExportInst != EndBlock;
466         NextExportInst = llvm::next(NextExportInst)) {
467      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
468          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
469        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
470            .getImm();
471        if (CurrentInstExportType == InstExportType) {
472          isLastInstructionOfItsType = false;
473          break;
474        }
475      }
476    }
477    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
478    if (!EOP && !isLastInstructionOfItsType)
479      return BB;
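    // CF_INST_EXPORT_DONE encoding: 84 (0x54) on Evergreen, 40 (0x28) on R600.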
480    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
481    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
482            .addOperand(MI->getOperand(0))
483            .addOperand(MI->getOperand(1))
484            .addOperand(MI->getOperand(2))
485            .addOperand(MI->getOperand(3))
486            .addOperand(MI->getOperand(4))
487            .addOperand(MI->getOperand(5))
488            .addOperand(MI->getOperand(6))
489            .addImm(CfInst)
490            .addImm(EOP);
491    break;
492  }
493  case AMDGPU::RETURN: {
494    // RETURN instructions must have the live-out registers as implicit uses,
495    // otherwise they appear dead.
496    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
497    MachineInstrBuilder MIB(*MF, MI);
498    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
499      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
500    return BB;
501  }
502  }
503
504  MI->eraseFromParent();
505  return BB;
506}
507
508//===----------------------------------------------------------------------===//
509// Custom DAG Lowering Operations
510//===----------------------------------------------------------------------===//
511
512SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
513  MachineFunction &MF = DAG.getMachineFunction();
514  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
515  switch (Op.getOpcode()) {
516  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
517  case ISD::FCOS:
518  case ISD::FSIN: return LowerTrig(Op, DAG);
519  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
520  case ISD::STORE: return LowerSTORE(Op, DAG);
521  case ISD::LOAD: return LowerLOAD(Op, DAG);
522  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
523  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
524  case ISD::INTRINSIC_VOID: {
525    SDValue Chain = Op.getOperand(0);
526    unsigned IntrinsicID =
527                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
528    switch (IntrinsicID) {
529    case AMDGPUIntrinsic::AMDGPU_store_output: {
530      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
531      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
532      MFI->LiveOuts.push_back(Reg);
533      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
534    }
535    case AMDGPUIntrinsic::R600_store_swizzle: {
536      const SDValue Args[8] = {
537        Chain,
538        Op.getOperand(2), // Export Value
539        Op.getOperand(3), // ArrayBase
540        Op.getOperand(4), // Type
541        DAG.getConstant(0, MVT::i32), // SWZ_X
542        DAG.getConstant(1, MVT::i32), // SWZ_Y
543        DAG.getConstant(2, MVT::i32), // SWZ_Z
544        DAG.getConstant(3, MVT::i32) // SWZ_W
545      };
546      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
547          Args, 8);
548    }
549
550    // default for switch(IntrinsicID)
551    default: break;
552    }
553    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
554    break;
555  }
556  case ISD::INTRINSIC_WO_CHAIN: {
557    unsigned IntrinsicID =
558                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
559    EVT VT = Op.getValueType();
560    SDLoc DL(Op);
561    switch(IntrinsicID) {
562    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
563    case AMDGPUIntrinsic::R600_load_input: {
564      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
565      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
566      MachineFunction &MF = DAG.getMachineFunction();
567      MachineRegisterInfo &MRI = MF.getRegInfo();
568      MRI.addLiveIn(Reg);
569      return DAG.getCopyFromReg(DAG.getEntryNode(),
570          SDLoc(DAG.getEntryNode()), Reg, VT);
571    }
572
573    case AMDGPUIntrinsic::R600_interp_input: {
574      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
575      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
576      MachineSDNode *interp;
577      if (ijb < 0) {
578        const MachineFunction &MF = DAG.getMachineFunction();
579        const R600InstrInfo *TII =
580          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
581        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
582            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
583        return DAG.getTargetExtractSubreg(
584            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
585            DL, MVT::f32, SDValue(interp, 0));
586      }
587      MachineFunction &MF = DAG.getMachineFunction();
588      MachineRegisterInfo &MRI = MF.getRegInfo();
589      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
590      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
591      MRI.addLiveIn(RegisterI);
592      MRI.addLiveIn(RegisterJ);
593      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
594          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
595      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
596          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
597
598      if (slot % 4 < 2)
599        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
600            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
601            RegisterJNode, RegisterINode);
602      else
603        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
604            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
605            RegisterJNode, RegisterINode);
606      return SDValue(interp, slot % 2);
607    }
608    case AMDGPUIntrinsic::R600_interp_xy:
609    case AMDGPUIntrinsic::R600_interp_zw: {
610      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
611      MachineSDNode *interp;
612      SDValue RegisterINode = Op.getOperand(2);
613      SDValue RegisterJNode = Op.getOperand(3);
614
615      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
616        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
617            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
618            RegisterJNode, RegisterINode);
619      else
620        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
621            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
622            RegisterJNode, RegisterINode);
623      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
624          SDValue(interp, 0), SDValue(interp, 1));
625    }
626    case AMDGPUIntrinsic::R600_tex:
627    case AMDGPUIntrinsic::R600_texc:
628    case AMDGPUIntrinsic::R600_txl:
629    case AMDGPUIntrinsic::R600_txlc:
630    case AMDGPUIntrinsic::R600_txb:
631    case AMDGPUIntrinsic::R600_txbc:
632    case AMDGPUIntrinsic::R600_txf:
633    case AMDGPUIntrinsic::R600_txq:
634    case AMDGPUIntrinsic::R600_ddx:
635    case AMDGPUIntrinsic::R600_ddy:
636    case AMDGPUIntrinsic::R600_ldptr: {
637      unsigned TextureOp;
638      switch (IntrinsicID) {
639      case AMDGPUIntrinsic::R600_tex:
640        TextureOp = 0;
641        break;
642      case AMDGPUIntrinsic::R600_texc:
643        TextureOp = 1;
644        break;
645      case AMDGPUIntrinsic::R600_txl:
646        TextureOp = 2;
647        break;
648      case AMDGPUIntrinsic::R600_txlc:
649        TextureOp = 3;
650        break;
651      case AMDGPUIntrinsic::R600_txb:
652        TextureOp = 4;
653        break;
654      case AMDGPUIntrinsic::R600_txbc:
655        TextureOp = 5;
656        break;
657      case AMDGPUIntrinsic::R600_txf:
658        TextureOp = 6;
659        break;
660      case AMDGPUIntrinsic::R600_txq:
661        TextureOp = 7;
662        break;
663      case AMDGPUIntrinsic::R600_ddx:
664        TextureOp = 8;
665        break;
666      case AMDGPUIntrinsic::R600_ddy:
667        TextureOp = 9;
668        break;
669      case AMDGPUIntrinsic::R600_ldptr:
670        TextureOp = 10;
671        break;
672      default:
673        llvm_unreachable("Unknown Texture Operation");
674      }
675
676      SDValue TexArgs[19] = {
677        DAG.getConstant(TextureOp, MVT::i32),
678        Op.getOperand(1),
679        DAG.getConstant(0, MVT::i32),
680        DAG.getConstant(1, MVT::i32),
681        DAG.getConstant(2, MVT::i32),
682        DAG.getConstant(3, MVT::i32),
683        Op.getOperand(2),
684        Op.getOperand(3),
685        Op.getOperand(4),
686        DAG.getConstant(0, MVT::i32),
687        DAG.getConstant(1, MVT::i32),
688        DAG.getConstant(2, MVT::i32),
689        DAG.getConstant(3, MVT::i32),
690        Op.getOperand(5),
691        Op.getOperand(6),
692        Op.getOperand(7),
693        Op.getOperand(8),
694        Op.getOperand(9),
695        Op.getOperand(10)
696      };
697      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
698    }
699    case AMDGPUIntrinsic::AMDGPU_dp4: {
700      SDValue Args[8] = {
701      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
702          DAG.getConstant(0, MVT::i32)),
703      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
704          DAG.getConstant(0, MVT::i32)),
705      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
706          DAG.getConstant(1, MVT::i32)),
707      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
708          DAG.getConstant(1, MVT::i32)),
709      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
710          DAG.getConstant(2, MVT::i32)),
711      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
712          DAG.getConstant(2, MVT::i32)),
713      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
714          DAG.getConstant(3, MVT::i32)),
715      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
716          DAG.getConstant(3, MVT::i32))
717      };
718      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
719    }
720
721    case Intrinsic::r600_read_ngroups_x:
722      return LowerImplicitParameter(DAG, VT, DL, 0);
723    case Intrinsic::r600_read_ngroups_y:
724      return LowerImplicitParameter(DAG, VT, DL, 1);
725    case Intrinsic::r600_read_ngroups_z:
726      return LowerImplicitParameter(DAG, VT, DL, 2);
727    case Intrinsic::r600_read_global_size_x:
728      return LowerImplicitParameter(DAG, VT, DL, 3);
729    case Intrinsic::r600_read_global_size_y:
730      return LowerImplicitParameter(DAG, VT, DL, 4);
731    case Intrinsic::r600_read_global_size_z:
732      return LowerImplicitParameter(DAG, VT, DL, 5);
733    case Intrinsic::r600_read_local_size_x:
734      return LowerImplicitParameter(DAG, VT, DL, 6);
735    case Intrinsic::r600_read_local_size_y:
736      return LowerImplicitParameter(DAG, VT, DL, 7);
737    case Intrinsic::r600_read_local_size_z:
738      return LowerImplicitParameter(DAG, VT, DL, 8);
739
740    case Intrinsic::r600_read_tgid_x:
741      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
742                                  AMDGPU::T1_X, VT);
743    case Intrinsic::r600_read_tgid_y:
744      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
745                                  AMDGPU::T1_Y, VT);
746    case Intrinsic::r600_read_tgid_z:
747      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
748                                  AMDGPU::T1_Z, VT);
749    case Intrinsic::r600_read_tidig_x:
750      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
751                                  AMDGPU::T0_X, VT);
752    case Intrinsic::r600_read_tidig_y:
753      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
754                                  AMDGPU::T0_Y, VT);
755    case Intrinsic::r600_read_tidig_z:
756      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
757                                  AMDGPU::T0_Z, VT);
758    }
759    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
760    break;
761  }
762  } // end switch(Op.getOpcode())
763  return SDValue();
764}
765
766void R600TargetLowering::ReplaceNodeResults(SDNode *N,
767                                            SmallVectorImpl<SDValue> &Results,
768                                            SelectionDAG &DAG) const {
769  switch (N->getOpcode()) {
770  default: return;
771  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
772    return;
773  case ISD::LOAD: {
774    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
775    Results.push_back(SDValue(Node, 0));
776    Results.push_back(SDValue(Node, 1));
777    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
778    // function
779    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
780    return;
781  }
782  case ISD::STORE:
783    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
784    Results.push_back(SDValue(Node, 0));
785    return;
786  }
787}
788
789SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
790  // On hw >= R700, the COS/SIN input must be between -1.0 and 1.0.
791  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
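  // For x in [-Pi, Pi), x / 2Pi + 0.5 lies in [0, 1), so FRACT leaves it
  // unchanged and subtracting 0.5 gives back x / 2Pi, which is in [-0.5, 0.5).
  // Larger inputs are wrapped into the same range by FRACT (modulo one period).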
792  EVT VT = Op.getValueType();
793  SDValue Arg = Op.getOperand(0);
794  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
795      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
796        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
797          DAG.getConstantFP(0.15915494309, MVT::f32)),
798        DAG.getConstantFP(0.5, MVT::f32)));
799  unsigned TrigNode;
800  switch (Op.getOpcode()) {
801  case ISD::FCOS:
802    TrigNode = AMDGPUISD::COS_HW;
803    break;
804  case ISD::FSIN:
805    TrigNode = AMDGPUISD::SIN_HW;
806    break;
807  default:
808    llvm_unreachable("Wrong trig opcode");
809  }
810  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
811      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
812        DAG.getConstantFP(-0.5, MVT::f32)));
813  if (Gen >= AMDGPUSubtarget::R700)
814    return TrigVal;
815  // On R600 hw, COS/SIN input must be between -Pi and Pi.
816  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
817      DAG.getConstantFP(3.14159265359, MVT::f32));
818}
819
820SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
821  return DAG.getNode(
822      ISD::SETCC,
823      SDLoc(Op),
824      MVT::i1,
825      Op, DAG.getConstantFP(0.0f, MVT::f32),
826      DAG.getCondCode(ISD::SETNE)
827      );
828}
829
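// The implicit kernel parameters (ngroups, global size and local size, in that
// order) occupy the first nine dwords of CONSTANT_BUFFER_0; DwordOffset picks
// one of them. See the r600_read_* intrinsic handling in LowerOperation.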
830SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
831                                                   SDLoc DL,
832                                                   unsigned DwordOffset) const {
833  unsigned ByteOffset = DwordOffset * 4;
834  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
835                                      AMDGPUAS::CONSTANT_BUFFER_0);
836
837  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
838  assert(isInt<16>(ByteOffset));
839
840  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
841                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
842                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
843                     false, false, false, 0);
844}
845
846SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
847
848  MachineFunction &MF = DAG.getMachineFunction();
849  const AMDGPUFrameLowering *TFL =
850   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
851
852  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
853  assert(FIN);
854
855  unsigned FrameIndex = FIN->getIndex();
856  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
857  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
858}
859
860bool R600TargetLowering::isZero(SDValue Op) const {
861  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
862    return Cst->isNullValue();
863  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
864    return CstFP->isZero();
865  } else {
866    return false;
867  }
868}
869
870SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
871  SDLoc DL(Op);
872  EVT VT = Op.getValueType();
873
874  SDValue LHS = Op.getOperand(0);
875  SDValue RHS = Op.getOperand(1);
876  SDValue True = Op.getOperand(2);
877  SDValue False = Op.getOperand(3);
878  SDValue CC = Op.getOperand(4);
879  SDValue Temp;
880
881  // LHS and RHS are guaranteed to be the same value type
882  EVT CompareVT = LHS.getValueType();
883
884  // Check if we can lower this to a native operation.
885
886  // Try to lower to a SET* instruction:
887  //
888  // SET* can match the following patterns:
889  //
890  // select_cc f32, f32, -1,  0, cc_supported
891  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
892  // select_cc i32, i32, -1,  0, cc_supported
893  //
894
895  // Move hardware True/False values to the correct operand.
896  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
897  ISD::CondCode InverseCC =
898     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
899  if (isHWTrueValue(False) && isHWFalseValue(True)) {
900    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
901      std::swap(False, True);
902      CC = DAG.getCondCode(InverseCC);
903    } else {
904      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
905      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
906        std::swap(False, True);
907        std::swap(LHS, RHS);
908        CC = DAG.getCondCode(SwapInvCC);
909      }
910    }
911  }
912
913  if (isHWTrueValue(True) && isHWFalseValue(False) &&
914      (CompareVT == VT || VT == MVT::i32)) {
915    // This can be matched by a SET* instruction.
916    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
917  }
918
919  // Try to lower to a CND* instruction:
920  //
921  // CND* can match the following patterns:
922  //
923  // select_cc f32, 0.0, f32, f32, cc_supported
924  // select_cc f32, 0.0, i32, i32, cc_supported
925  // select_cc i32, 0,   f32, f32, cc_supported
926  // select_cc i32, 0,   i32, i32, cc_supported
927  //
928
929  // Try to move the zero value to the RHS
930  if (isZero(LHS)) {
931    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
932    // Try swapping the operands
933    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
934    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
935      std::swap(LHS, RHS);
936      CC = DAG.getCondCode(CCSwapped);
937    } else {
938      // Try inverting the condition and then swapping the operands
939      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
940      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
941      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
942        std::swap(True, False);
943        std::swap(LHS, RHS);
944        CC = DAG.getCondCode(CCSwapped);
945      }
946    }
947  }
948  if (isZero(RHS)) {
949    SDValue Cond = LHS;
950    SDValue Zero = RHS;
951    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
952    if (CompareVT != VT) {
953      // Bitcast True / False to the correct types.  This will end up being
954      // a nop, but it allows us to define only a single pattern in the
955      // .TD files for each CND* instruction rather than one pattern for
956      // integer True/False and one for fp True/False.
957      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
958      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
959    }
960
961    switch (CCOpcode) {
962    case ISD::SETONE:
963    case ISD::SETUNE:
964    case ISD::SETNE:
965      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
966      Temp = True;
967      True = False;
968      False = Temp;
969      break;
970    default:
971      break;
972    }
973    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
974        Cond, Zero,
975        True, False,
976        DAG.getCondCode(CCOpcode));
977    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
978  }
979
980
981  // Possible Min/Max pattern
982  SDValue MinMax = LowerMinMax(Op, DAG);
983  if (MinMax.getNode()) {
984    return MinMax;
985  }
986
987  // If we make it this far, it means we have no native instructions to handle
988  // this SELECT_CC, so we must lower it.
989  SDValue HWTrue, HWFalse;
990
991  if (CompareVT == MVT::f32) {
992    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
993    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
994  } else if (CompareVT == MVT::i32) {
995    HWTrue = DAG.getConstant(-1, CompareVT);
996    HWFalse = DAG.getConstant(0, CompareVT);
997  }
998  else {
999    assert(!"Unhandled value type in LowerSELECT_CC");
1000  }
1001
1002  // Lower this unsupported SELECT_CC into a combination of two supported
1003  // SELECT_CC operations.
1004  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1005
1006  return DAG.getNode(ISD::SELECT_CC, DL, VT,
1007      Cond, HWFalse,
1008      True, False,
1009      DAG.getCondCode(ISD::SETNE));
1010}
1011
1012/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1013/// convert these pointers to a register index.  Each register holds
1014/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
1015/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
1016/// for indirect addressing.
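/// For example, with a StackWidth of 1 each register effectively covers 4
/// bytes of stack, so the byte address is shifted right by 2; a StackWidth of
/// 2 covers 8 bytes (shift by 3) and a StackWidth of 4 covers all 16 bytes
/// (shift by 4).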
1017SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1018                                               unsigned StackWidth,
1019                                               SelectionDAG &DAG) const {
1020  unsigned SRLPad;
1021  switch(StackWidth) {
1022  case 1:
1023    SRLPad = 2;
1024    break;
1025  case 2:
1026    SRLPad = 3;
1027    break;
1028  case 4:
1029    SRLPad = 4;
1030    break;
1031  default: llvm_unreachable("Invalid stack width");
1032  }
1033
1034  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1035                     DAG.getConstant(SRLPad, MVT::i32));
1036}
1037
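/// Map element \p ElemIdx of a value being spilled onto a channel and a
/// pointer increment: \p Channel is the sub-register to use and \p PtrIncr is
/// how far to advance the register index relative to the previous element,
/// given that only \p StackWidth channels of each stack register are used.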
1038void R600TargetLowering::getStackAddress(unsigned StackWidth,
1039                                         unsigned ElemIdx,
1040                                         unsigned &Channel,
1041                                         unsigned &PtrIncr) const {
1042  switch (StackWidth) {
1043  default:
1044  case 1:
1045    Channel = 0;
1046    if (ElemIdx > 0) {
1047      PtrIncr = 1;
1048    } else {
1049      PtrIncr = 0;
1050    }
1051    break;
1052  case 2:
1053    Channel = ElemIdx % 2;
1054    if (ElemIdx == 2) {
1055      PtrIncr = 1;
1056    } else {
1057      PtrIncr = 0;
1058    }
1059    break;
1060  case 4:
1061    Channel = ElemIdx;
1062    PtrIncr = 0;
1063    break;
1064  }
1065}
1066
1067SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1068  SDLoc DL(Op);
1069  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1070  SDValue Chain = Op.getOperand(0);
1071  SDValue Value = Op.getOperand(1);
1072  SDValue Ptr = Op.getOperand(2);
1073
1074  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1075  if (Result.getNode()) {
1076    return Result;
1077  }
1078
1079  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1080    if (StoreNode->isTruncatingStore()) {
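      // Truncating global stores (i8/i16) are lowered to a masked
      // read-modify-write (STORE_MSKOR) on the containing dword: compute the
      // dword-aligned address, a mask covering the stored byte/halfword, and
      // the value shifted into position within that dword.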
1081      EVT VT = Value.getValueType();
1082      assert(VT.bitsLE(MVT::i32));
1083      EVT MemVT = StoreNode->getMemoryVT();
1084      SDValue MaskConstant;
1085      if (MemVT == MVT::i8) {
1086        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1087      } else {
1088        assert(MemVT == MVT::i16);
1089        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1090      }
1091      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1092                                      DAG.getConstant(2, MVT::i32));
1093      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1094                                      DAG.getConstant(0x00000003, VT));
1095      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1096      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1097                                   DAG.getConstant(3, VT));
1098      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1099      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1100      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1101      // vector instead.
1102      SDValue Src[4] = {
1103        ShiftedValue,
1104        DAG.getConstant(0, MVT::i32),
1105        DAG.getConstant(0, MVT::i32),
1106        Mask
1107      };
1108      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
1109      SDValue Args[3] = { Chain, Input, DWordAddr };
1110      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1111                                     Op->getVTList(), Args, 3, MemVT,
1112                                     StoreNode->getMemOperand());
1113    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1114               Value.getValueType().bitsGE(MVT::i32)) {
1115      // Convert pointer from byte address to dword address.
1116      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1117                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1118                                    Ptr, DAG.getConstant(2, MVT::i32)));
1119
1120      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1121        assert(!"Truncated and indexed stores not supported yet");
1122      } else {
1123        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1124      }
1125      return Chain;
1126    }
1127  }
1128
1129  EVT ValueVT = Value.getValueType();
1130
1131  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1132    return SDValue();
1133  }
1134
1135  // Lowering for indirect addressing
1136
1137  const MachineFunction &MF = DAG.getMachineFunction();
1138  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1139                                         getTargetMachine().getFrameLowering());
1140  unsigned StackWidth = TFL->getStackWidth(MF);
1141
1142  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1143
1144  if (ValueVT.isVector()) {
1145    unsigned NumElemVT = ValueVT.getVectorNumElements();
1146    EVT ElemVT = ValueVT.getVectorElementType();
1147    SDValue Stores[4];
1148
1149    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1150                                      "vector width in store");
1151
1152    for (unsigned i = 0; i < NumElemVT; ++i) {
1153      unsigned Channel, PtrIncr;
1154      getStackAddress(StackWidth, i, Channel, PtrIncr);
1155      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1156                        DAG.getConstant(PtrIncr, MVT::i32));
1157      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1158                                 Value, DAG.getConstant(i, MVT::i32));
1159
1160      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1161                              Chain, Elem, Ptr,
1162                              DAG.getTargetConstant(Channel, MVT::i32));
1163    }
1164    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1165  } else {
1166    if (ValueVT == MVT::i8) {
1167      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1168    }
1169    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1170                        DAG.getTargetConstant(0, MVT::i32)); // Channel
1171  }
1172
1173  return Chain;
1174}
1175
1176// Return 512 + (kc_bank << 12) for a constant buffer address space, or -1.
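// e.g. CONSTANT_BUFFER_2 maps to 512 + 2 * 4096 = 8704.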
1177static int
1178ConstantAddressBlock(unsigned AddressSpace) {
1179  switch (AddressSpace) {
1180  case AMDGPUAS::CONSTANT_BUFFER_0:
1181    return 512;
1182  case AMDGPUAS::CONSTANT_BUFFER_1:
1183    return 512 + 4096;
1184  case AMDGPUAS::CONSTANT_BUFFER_2:
1185    return 512 + 4096 * 2;
1186  case AMDGPUAS::CONSTANT_BUFFER_3:
1187    return 512 + 4096 * 3;
1188  case AMDGPUAS::CONSTANT_BUFFER_4:
1189    return 512 + 4096 * 4;
1190  case AMDGPUAS::CONSTANT_BUFFER_5:
1191    return 512 + 4096 * 5;
1192  case AMDGPUAS::CONSTANT_BUFFER_6:
1193    return 512 + 4096 * 6;
1194  case AMDGPUAS::CONSTANT_BUFFER_7:
1195    return 512 + 4096 * 7;
1196  case AMDGPUAS::CONSTANT_BUFFER_8:
1197    return 512 + 4096 * 8;
1198  case AMDGPUAS::CONSTANT_BUFFER_9:
1199    return 512 + 4096 * 9;
1200  case AMDGPUAS::CONSTANT_BUFFER_10:
1201    return 512 + 4096 * 10;
1202  case AMDGPUAS::CONSTANT_BUFFER_11:
1203    return 512 + 4096 * 11;
1204  case AMDGPUAS::CONSTANT_BUFFER_12:
1205    return 512 + 4096 * 12;
1206  case AMDGPUAS::CONSTANT_BUFFER_13:
1207    return 512 + 4096 * 13;
1208  case AMDGPUAS::CONSTANT_BUFFER_14:
1209    return 512 + 4096 * 14;
1210  case AMDGPUAS::CONSTANT_BUFFER_15:
1211    return 512 + 4096 * 15;
1212  default:
1213    return -1;
1214  }
1215}
1216
1217SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1218{
1219  EVT VT = Op.getValueType();
1220  SDLoc DL(Op);
1221  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1222  SDValue Chain = Op.getOperand(0);
1223  SDValue Ptr = Op.getOperand(1);
1224  SDValue LoweredLoad;
1225
1226  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1227    SDValue MergedValues[2] = {
1228      SplitVectorLoad(Op, DAG),
1229      Chain
1230    };
1231    return DAG.getMergeValues(MergedValues, 2, DL);
1232  }
1233
1234  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1235  if (ConstantBlock > -1 &&
1236      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1237       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1238    SDValue Result;
1239    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
1240        isa<Constant>(LoadNode->getSrcValue()) ||
1241        isa<ConstantSDNode>(Ptr)) {
1242      SDValue Slots[4];
1243      for (unsigned i = 0; i < 4; i++) {
1244        // We want the constant position encoded with the following formula:
1245        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1246        // const_index is Ptr, which LLVM computes using an alignment of 16.
1247        // Thus we add (((512 + (kc_bank << 12)) * 4) + chan) * 4 here and
1248        // then divide by 4 at the ISel step.
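        // For example, element i of a load from CONSTANT_BUFFER_0 at byte
        // offset 32 (const_index 2) becomes 512 * 16 + 32 + 4 * i here, which
        // is ((512 + 2) << 2) + i after the division by 4.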
1249        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1250            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1251        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1252      }
1253      EVT NewVT = MVT::v4i32;
1254      unsigned NumElements = 4;
1255      if (VT.isVector()) {
1256        NewVT = VT;
1257        NumElements = VT.getVectorNumElements();
1258      }
1259      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
1260    } else {
1261      // A non-constant Ptr can't be folded; keep it as a v4i32 load.
1262      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1263          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1264          DAG.getConstant(LoadNode->getAddressSpace() -
1265                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1266          );
1267    }
1268
1269    if (!VT.isVector()) {
1270      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1271          DAG.getConstant(0, MVT::i32));
1272    }
1273
1274    SDValue MergedValues[2] = {
1275        Result,
1276        Chain
1277    };
1278    return DAG.getMergeValues(MergedValues, 2, DL);
1279  }
1280
1281  // For most operations returning SDValue() will result in the node being
1282  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1283  // need to manually expand loads that may be legal in some address spaces and
1284  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1285  // compute shaders, since the data is sign extended when it is uploaded to the
1286  // buffer. However SEXT loads from other address spaces are not supported, so
1287  // we need to expand them here.
1288  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1289    EVT MemVT = LoadNode->getMemoryVT();
1290    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1291    SDValue ShiftAmount =
1292          DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1293    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1294                                  LoadNode->getPointerInfo(), MemVT,
1295                                  LoadNode->isVolatile(),
1296                                  LoadNode->isNonTemporal(),
1297                                  LoadNode->getAlignment());
1298    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1299    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1300
1301    SDValue MergedValues[2] = { Sra, Chain };
1302    return DAG.getMergeValues(MergedValues, 2, DL);
1303  }
1304
1305  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1306    return SDValue();
1307  }
1308
1309  // Lowering for indirect addressing
1310  const MachineFunction &MF = DAG.getMachineFunction();
1311  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1312                                         getTargetMachine().getFrameLowering());
1313  unsigned StackWidth = TFL->getStackWidth(MF);
1314
1315  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1316
1317  if (VT.isVector()) {
1318    unsigned NumElemVT = VT.getVectorNumElements();
1319    EVT ElemVT = VT.getVectorElementType();
1320    SDValue Loads[4];
1321
1322    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1323                                      "vector width in load");
1324
1325    for (unsigned i = 0; i < NumElemVT; ++i) {
1326      unsigned Channel, PtrIncr;
1327      getStackAddress(StackWidth, i, Channel, PtrIncr);
1328      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1329                        DAG.getConstant(PtrIncr, MVT::i32));
1330      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1331                             Chain, Ptr,
1332                             DAG.getTargetConstant(Channel, MVT::i32),
1333                             Op.getOperand(2));
1334    }
1335    for (unsigned i = NumElemVT; i < 4; ++i) {
1336      Loads[i] = DAG.getUNDEF(ElemVT);
1337    }
1338    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1339    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1340  } else {
1341    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1342                              Chain, Ptr,
1343                              DAG.getTargetConstant(0, MVT::i32), // Channel
1344                              Op.getOperand(2));
1345  }
1346
1347  SDValue Ops[2];
1348  Ops[0] = LoweredLoad;
1349  Ops[1] = Chain;
1350
1351  return DAG.getMergeValues(Ops, 2, DL);
1352}
1353
1354/// XXX Only kernel functions are supported, so we can assume for now that
1355/// every function is a kernel function, but in the future we should use
1356/// separate calling conventions for kernel and non-kernel functions.
1357SDValue R600TargetLowering::LowerFormalArguments(
1358                                      SDValue Chain,
1359                                      CallingConv::ID CallConv,
1360                                      bool isVarArg,
1361                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1362                                      SDLoc DL, SelectionDAG &DAG,
1363                                      SmallVectorImpl<SDValue> &InVals) const {
1364  SmallVector<CCValAssign, 16> ArgLocs;
1365  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1366                 getTargetMachine(), ArgLocs, *DAG.getContext());
1367  MachineFunction &MF = DAG.getMachineFunction();
1368  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
1369
1370  SmallVector<ISD::InputArg, 8> LocalIns;
1371
1372  getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
1373                          LocalIns);
1374
1375  AnalyzeFormalArguments(CCInfo, LocalIns);
1376
1377  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1378    CCValAssign &VA = ArgLocs[i];
1379    EVT VT = Ins[i].VT;
1380    EVT MemVT = LocalIns[i].VT;
1381
1382    if (ShaderType != ShaderType::COMPUTE) {
1383      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1384      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1385      InVals.push_back(Register);
1386      continue;
1387    }
1388
1389    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1390                                                   AMDGPUAS::CONSTANT_BUFFER_0);
1391
1392    // The first 36 bytes of the input buffer contain information about
1393    // thread group and global sizes.
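    // (Those are the nine dwords read by LowerImplicitParameter; the actual
    // kernel arguments therefore start at byte offset 36.)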
1394    SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
1395                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1396                                 MachinePointerInfo(UndefValue::get(PtrTy)),
1397                                 MemVT, false, false, 4);
1398                                 // 4 is the preferred alignment for
1399                                 // the CONSTANT memory space.
1400    InVals.push_back(Arg);
1401  }
1402  return Chain;
1403}
1404
1405EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1406   if (!VT.isVector()) return MVT::i32;
1407   return VT.changeVectorElementTypeToInteger();
1408}
1409
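/// Fold "trivial" elements of a BUILD_VECTOR into the export swizzle: undef
/// elements become SEL_MASK_WRITE, constant 0.0 / 1.0 become SEL_0 / SEL_1,
/// and an element equal to an earlier one reuses that element's channel.
/// \p RemapSwizzle records the resulting old-channel -> select mapping.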
1410static SDValue
1411CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
1412                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
1413  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1414  assert(RemapSwizzle.empty());
1415  SDValue NewBldVec[4] = {
1416      VectorEntry.getOperand(0),
1417      VectorEntry.getOperand(1),
1418      VectorEntry.getOperand(2),
1419      VectorEntry.getOperand(3)
1420  };
1421
1422  for (unsigned i = 0; i < 4; i++) {
1423    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1424      // We mask the write here to teach later passes that the i-th element of
1425      // this vector is undef. This lets us reduce 128-bit register usage,
1426      // break false dependencies and additionally make the assembly easier to read.
1427      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1428    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1429      if (C->isZero()) {
1430        RemapSwizzle[i] = 4; // SEL_0
1431        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1432      } else if (C->isExactlyValue(1.0)) {
1433        RemapSwizzle[i] = 5; // SEL_1
1434        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1435      }
1436    }
1437
1438    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1439      continue;
1440    for (unsigned j = 0; j < i; j++) {
1441      if (NewBldVec[i] == NewBldVec[j]) {
1442        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1443        RemapSwizzle[i] = j;
1444        break;
1445      }
1446    }
1447  }
1448
1449  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1450      VectorEntry.getValueType(), NewBldVec, 4);
1451}
1452
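// Try to place each element that is extracted from another vector into the
// lane it is extracted from, so the resulting swizzle stays as close to the
// identity as possible. Lanes that already sit in their source position are
// pinned and are never swapped away.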
1453static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1454                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
1455  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1456  assert(RemapSwizzle.empty());
1457  SDValue NewBldVec[4] = {
1458      VectorEntry.getOperand(0),
1459      VectorEntry.getOperand(1),
1460      VectorEntry.getOperand(2),
1461      VectorEntry.getOperand(3)
1462  };
1463  bool isUnmovable[4] = { false, false, false, false };
1464  for (unsigned i = 0; i < 4; i++)
1465    RemapSwizzle[i] = i;
1466
1467  for (unsigned i = 0; i < 4; i++) {
1468    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1469      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1470          ->getZExtValue();
1471      if (i == Idx) {
1472        isUnmovable[Idx] = true;
1473        continue;
1474      }
1475      if (isUnmovable[Idx])
1476        continue;
1477      // Swap i and Idx
1478      std::swap(NewBldVec[Idx], NewBldVec[i]);
1479      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1480      break;
1481    }
1482  }
1483
1484  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1485      VectorEntry.getValueType(), NewBldVec, 4);
1486}
1487
1488
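// Run both swizzle optimizations on a BUILD_VECTOR operand and rewrite the
// accompanying swizzle selects (Swz[0..3]) in place to match the remapping
// produced by each pass.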
1489SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
1490                                            SelectionDAG &DAG) const {
1491  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1492  // Old -> New swizzle values
1493  DenseMap<unsigned, unsigned> SwizzleRemap;
1494
1495  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1496  for (unsigned i = 0; i < 4; i++) {
1497    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1498    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1499      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1500  }
1501
1502  SwizzleRemap.clear();
1503  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1504  for (unsigned i = 0; i < 4; i++) {
1505    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1506    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1507      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1508  }
1509
1510  return BuildVector;
1511}
1512
1513
1514//===----------------------------------------------------------------------===//
1515// Custom DAG Optimizations
1516//===----------------------------------------------------------------------===//
1517
1518SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1519                                              DAGCombinerInfo &DCI) const {
1520  SelectionDAG &DAG = DCI.DAG;
1521
1522  switch (N->getOpcode()) {
1523  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1524  case ISD::FP_ROUND: {
1525      SDValue Arg = N->getOperand(0);
1526      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1527        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1528                           Arg.getOperand(0));
1529      }
1530      break;
1531    }
1532
1533  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1534  // (i32 select_cc f32, f32, -1, 0 cc)
1535  //
1536  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1537  // this to one of the SET*_DX10 instructions.
1538  case ISD::FP_TO_SINT: {
1539    SDValue FNeg = N->getOperand(0);
1540    if (FNeg.getOpcode() != ISD::FNEG) {
1541      return SDValue();
1542    }
1543    SDValue SelectCC = FNeg.getOperand(0);
1544    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1545        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1546        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1547        !isHWTrueValue(SelectCC.getOperand(2)) ||
1548        !isHWFalseValue(SelectCC.getOperand(3))) {
1549      return SDValue();
1550    }
1551
1552    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1553                           SelectCC.getOperand(0), // LHS
1554                           SelectCC.getOperand(1), // RHS
1555                           DAG.getConstant(-1, MVT::i32), // True
1556                           DAG.getConstant(0, MVT::i32),  // False
1557                           SelectCC.getOperand(4)); // CC
1558
1559    break;
1560  }
1561
1562  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1563  // => build_vector elt0, ... , NewEltIdx, ... , eltN
1564  case ISD::INSERT_VECTOR_ELT: {
1565    SDValue InVec = N->getOperand(0);
1566    SDValue InVal = N->getOperand(1);
1567    SDValue EltNo = N->getOperand(2);
1568    SDLoc dl(N);
1569
1570    // If the inserted element is an UNDEF, just use the input vector.
1571    if (InVal.getOpcode() == ISD::UNDEF)
1572      return InVec;
1573
1574    EVT VT = InVec.getValueType();
1575
1576    // If we can't generate a legal BUILD_VECTOR, exit
1577    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1578      return SDValue();
1579
1580    // Check that we know which element is being inserted
1581    if (!isa<ConstantSDNode>(EltNo))
1582      return SDValue();
1583    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1584
1585    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1586    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1587    // vector elements.
1588    SmallVector<SDValue, 8> Ops;
1589    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1590      Ops.append(InVec.getNode()->op_begin(),
1591                 InVec.getNode()->op_end());
1592    } else if (InVec.getOpcode() == ISD::UNDEF) {
1593      unsigned NElts = VT.getVectorNumElements();
1594      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1595    } else {
1596      return SDValue();
1597    }
1598
1599    // Insert the element
1600    if (Elt < Ops.size()) {
1601      // All the operands of BUILD_VECTOR must have the same type;
1602      // we enforce that here.
1603      EVT OpVT = Ops[0].getValueType();
1604      if (InVal.getValueType() != OpVT)
1605        InVal = OpVT.bitsGT(InVal.getValueType()) ?
1606          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1607          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1608      Ops[Elt] = InVal;
1609    }
1610
1611    // Return the new vector
1612    return DAG.getNode(ISD::BUILD_VECTOR, dl,
1613                       VT, &Ops[0], Ops.size());
1614  }
1615
1616  // Extract_vec (Build_vector) generated by custom lowering
1617  // also needs to be combined here.
1618  case ISD::EXTRACT_VECTOR_ELT: {
1619    SDValue Arg = N->getOperand(0);
1620    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1621      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1622        unsigned Element = Const->getZExtValue();
1623        return Arg->getOperand(Element);
1624      }
1625    }
1626    if (Arg.getOpcode() == ISD::BITCAST &&
1627        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1628      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1629        unsigned Element = Const->getZExtValue();
1630        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1631            Arg->getOperand(0).getOperand(Element));
1632      }
1633    }
1634  }
1635
1636  case ISD::SELECT_CC: {
1637    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1638    //      selectcc x, y, a, b, inv(cc)
1639    //
1640    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1641    //      selectcc x, y, a, b, cc
1642    SDValue LHS = N->getOperand(0);
1643    if (LHS.getOpcode() != ISD::SELECT_CC) {
1644      return SDValue();
1645    }
1646
1647    SDValue RHS = N->getOperand(1);
1648    SDValue True = N->getOperand(2);
1649    SDValue False = N->getOperand(3);
1650    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1651
1652    if (LHS.getOperand(2).getNode() != True.getNode() ||
1653        LHS.getOperand(3).getNode() != False.getNode() ||
1654        RHS.getNode() != False.getNode()) {
1655      return SDValue();
1656    }
1657
1658    switch (NCC) {
1659    default: return SDValue();
1660    case ISD::SETNE: return LHS;
1661    case ISD::SETEQ: {
1662      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1663      LHSCC = ISD::getSetCCInverse(LHSCC,
1664                                  LHS.getOperand(0).getValueType().isInteger());
1665      if (DCI.isBeforeLegalizeOps() ||
1666          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1667        return DAG.getSelectCC(SDLoc(N),
1668                               LHS.getOperand(0),
1669                               LHS.getOperand(1),
1670                               LHS.getOperand(2),
1671                               LHS.getOperand(3),
1672                               LHSCC);
1673      break;
1674    }
1675    }
1676    return SDValue();
1677  }
1678
1679  case AMDGPUISD::EXPORT: {
1680    SDValue Arg = N->getOperand(1);
1681    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1682      break;
1683
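    // Rebuild the export: OptimizeSwizzle may replace the exported vector and
    // rewrites the SWZ_X..SWZ_W selects (starting at &NewArgs[4]) in place.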
1684    SDValue NewArgs[8] = {
1685      N->getOperand(0), // Chain
1686      SDValue(),
1687      N->getOperand(2), // ArrayBase
1688      N->getOperand(3), // Type
1689      N->getOperand(4), // SWZ_X
1690      N->getOperand(5), // SWZ_Y
1691      N->getOperand(6), // SWZ_Z
1692      N->getOperand(7) // SWZ_W
1693    };
1694    SDLoc DL(N);
1695    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1696    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1697  }
1698  case AMDGPUISD::TEXTURE_FETCH: {
1699    SDValue Arg = N->getOperand(1);
1700    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1701      break;
1702
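    // As with EXPORT above, rebuild the fetch with an optimized source vector;
    // OptimizeSwizzle rewrites the four select operands at NewArgs[2..5] in
    // place.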
1703    SDValue NewArgs[19] = {
1704      N->getOperand(0),
1705      N->getOperand(1),
1706      N->getOperand(2),
1707      N->getOperand(3),
1708      N->getOperand(4),
1709      N->getOperand(5),
1710      N->getOperand(6),
1711      N->getOperand(7),
1712      N->getOperand(8),
1713      N->getOperand(9),
1714      N->getOperand(10),
1715      N->getOperand(11),
1716      N->getOperand(12),
1717      N->getOperand(13),
1718      N->getOperand(14),
1719      N->getOperand(15),
1720      N->getOperand(16),
1721      N->getOperand(17),
1722      N->getOperand(18),
1723    };
1724    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1725    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1726        NewArgs, 19);
1727  }
1728  }
1729  return SDValue();
1730}
1731
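// Try to fold the node feeding operand SrcIdx of ParentNode into the operand
// itself: FNEG/FABS become the neg/abs source modifiers, CONST_COPY becomes an
// ALU_CONST read (provided the constant-read limits still hold), and MOV_IMM_*
// is turned into an inline-constant register or the ALU_LITERAL_X slot.
// Returns true if Src and the relevant modifier operands were updated.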
1732static bool
1733FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
1734            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
1735  const R600InstrInfo *TII =
1736      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1737  if (!Src.isMachineOpcode())
1738    return false;
1739  switch (Src.getMachineOpcode()) {
1740  case AMDGPU::FNEG_R600:
1741    if (!Neg.getNode())
1742      return false;
1743    Src = Src.getOperand(0);
1744    Neg = DAG.getTargetConstant(1, MVT::i32);
1745    return true;
1746  case AMDGPU::FABS_R600:
1747    if (!Abs.getNode())
1748      return false;
1749    Src = Src.getOperand(0);
1750    Abs = DAG.getTargetConstant(1, MVT::i32);
1751    return true;
1752  case AMDGPU::CONST_COPY: {
1753    unsigned Opcode = ParentNode->getMachineOpcode();
1754    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1755
1756    if (!Sel.getNode())
1757      return false;
1758
1759    SDValue CstOffset = Src.getOperand(0);
1760    if (ParentNode->getValueType(0).isVector())
1761      return false;
1762
1763    // Gather constant values
1764    int SrcIndices[] = {
1765      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1766      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1767      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
1768      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1769      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1770      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1771      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1772      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1773      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1774      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1775      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1776    };
1777    std::vector<unsigned> Consts;
1778    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
1779      int OtherSrcIdx = SrcIndices[i];
1780      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1781      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
1782        continue;
1783      if (HasDst) {
1784        OtherSrcIdx--;
1785        OtherSelIdx--;
1786      }
1787      if (RegisterSDNode *Reg =
1788          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
1789        if (Reg->getReg() == AMDGPU::ALU_CONST) {
1790          ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
1791              ParentNode->getOperand(OtherSelIdx));
1792          Consts.push_back(Cst->getZExtValue());
1793        }
1794      }
1795    }
1796
1797    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
1798    Consts.push_back(Cst->getZExtValue());
1799    if (!TII->fitsConstReadLimitations(Consts)) {
1800      return false;
1801    }
1802
1803    Sel = CstOffset;
1804    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
1805    return true;
1806  }
1807  case AMDGPU::MOV_IMM_I32:
1808  case AMDGPU::MOV_IMM_F32: {
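    // Immediates that match a dedicated inline-constant register (0.0, 0.5 and
    // 1.0 for floats, 0 and 1 for integers) are folded into that register;
    // anything else has to go through the ALU_LITERAL_X literal slot.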
1809    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
1810    uint64_t ImmValue = 0;
1811
1812
1813    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
1814      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
1815      float FloatValue = FPC->getValueAPF().convertToFloat();
1816      if (FloatValue == 0.0) {
1817        ImmReg = AMDGPU::ZERO;
1818      } else if (FloatValue == 0.5) {
1819        ImmReg = AMDGPU::HALF;
1820      } else if (FloatValue == 1.0) {
1821        ImmReg = AMDGPU::ONE;
1822      } else {
1823        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
1824      }
1825    } else {
1826      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
1827      uint64_t Value = C->getZExtValue();
1828      if (Value == 0) {
1829        ImmReg = AMDGPU::ZERO;
1830      } else if (Value == 1) {
1831        ImmReg = AMDGPU::ONE_INT;
1832      } else {
1833        ImmValue = Value;
1834      }
1835    }
1836
1837    // Check that we aren't already using an immediate.
1838    // XXX: It's possible for an instruction to have more than one
1839    // immediate operand, but this is not supported yet.
1840    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
1841      if (!Imm.getNode())
1842        return false;
1843      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
1844      assert(C);
1845      if (C->getZExtValue())
1846        return false;
1847      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
1848    }
1849    Src = DAG.getRegister(ImmReg, MVT::i32);
1850    return true;
1851  }
1852  default:
1853    return false;
1854  }
1855}
1856
1857
1858/// \brief Fold the instructions after selecting them
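///
/// Source modifiers (neg/abs), inline constants and literals are folded
/// directly into the operands of the selected machine nodes when the
/// instruction supports them.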
1859SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
1860                                            SelectionDAG &DAG) const {
1861  const R600InstrInfo *TII =
1862      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1863  if (!Node->isMachineOpcode())
1864    return Node;
1865  unsigned Opcode = Node->getMachineOpcode();
1866  SDValue FakeOp;
1867
1868  std::vector<SDValue> Ops;
1869  for (SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
1870       I != E; ++I)
1871    Ops.push_back(*I);
1872
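  // DOT_4 reads two four-channel sources, each channel with its own
  // src/neg/abs/sel operands, so the eight channels are folded independently.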
1873  if (Opcode == AMDGPU::DOT_4) {
1874    int OperandIdx[] = {
1875      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1876      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1877      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1878      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1879      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1880      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1881      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1882      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1883    };
1884    int NegIdx[] = {
1885      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
1886      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
1887      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
1888      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
1889      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
1890      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
1891      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
1892      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
1893    };
1894    int AbsIdx[] = {
1895      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
1896      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
1897      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
1898      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
1899      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
1900      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
1901      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
1902      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
1903    };
1904    for (unsigned i = 0; i < 8; i++) {
1905      if (OperandIdx[i] < 0)
1906        return Node;
1907      SDValue &Src = Ops[OperandIdx[i] - 1];
1908      SDValue &Neg = Ops[NegIdx[i] - 1];
1909      SDValue &Abs = Ops[AbsIdx[i] - 1];
1910      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1911      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1912      if (HasDst)
1913        SelIdx--;
1914      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1915      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
1916        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1917    }
1918  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
1919    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
1920      SDValue &Src = Ops[i];
1921      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
1922        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1923    }
1924  } else if (Opcode == AMDGPU::CLAMP_R600) {
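    // Rather than keeping a separate CLAMP_R600, fold the clamp into the
    // defining instruction's clamp output modifier when it supports modifiers.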
1925    SDValue Src = Node->getOperand(0);
1926    if (!Src.isMachineOpcode() ||
1927        !TII->hasInstrModifiers(Src.getMachineOpcode()))
1928      return Node;
1929    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
1930        AMDGPU::OpName::clamp);
1931    if (ClampIdx < 0)
1932      return Node;
1933    std::vector<SDValue> Ops;
1934    unsigned NumOp = Src.getNumOperands();
1935    for (unsigned i = 0; i < NumOp; ++i)
1936      Ops.push_back(Src.getOperand(i));
1937    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
1938    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
1939        Node->getVTList(), Ops);
1940  } else {
1941    if (!TII->hasInstrModifiers(Opcode))
1942      return Node;
1943    int OperandIdx[] = {
1944      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1945      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1946      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
1947    };
1948    int NegIdx[] = {
1949      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
1950      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
1951      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
1952    };
1953    int AbsIdx[] = {
1954      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
1955      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
1956      -1
1957    };
1958    for (unsigned i = 0; i < 3; i++) {
1959      if (OperandIdx[i] < 0)
1960        return Node;
1961      SDValue &Src = Ops[OperandIdx[i] - 1];
1962      SDValue &Neg = Ops[NegIdx[i] - 1];
1963      SDValue FakeAbs;
1964      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
1965      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1966      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1967      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
1968      if (HasDst) {
1969        SelIdx--;
1970        ImmIdx--;
1971      }
1972      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1973      SDValue &Imm = Ops[ImmIdx];
1974      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
1975        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1976    }
1977  }
1978
1979  return Node;
1980}
1981