//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Vector, Reduction, and Cube instructions need to fill the entire instruction
/// group to work correctly.  This pass expands these individual instructions
/// into several instructions that will completely fill the instruction group.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

namespace {

class R600ExpandSpecialInstrsPass : public MachineFunctionPass {

private:
  static char ID;
  const R600InstrInfo *TII;

  void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI,
      unsigned Op);

public:
  R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
    TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "R600 Expand special instructions pass";
  }
};

} // End anonymous namespace

char R600ExpandSpecialInstrsPass::ID = 0;

FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
  return new R600ExpandSpecialInstrsPass(TM);
}

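// If OldMI carries the immediate operand Op (e.g. clamp or one of the
// neg/abs modifiers), copy its value onto the matching operand of NewMI.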
void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
    const MachineInstr *OldMI, unsigned Op) {
  int OpIdx = TII->getOperandIdx(*OldMI, Op);
  if (OpIdx > -1) {
    uint64_t Val = OldMI->getOperand(OpIdx).getImm();
    TII->setImmOperand(NewMI, Op, Val);
  }
}

bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());

  const R600RegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
                                                  BB != BB_E; ++BB) {
    MachineBasicBlock &MBB = *BB;
    MachineBasicBlock::iterator I = MBB.begin();
    while (I != MBB.end()) {
      MachineInstr &MI = *I;
      I = std::next(I);

      // Expand LDS_*_RET instructions
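      // LDS reads return their result through the OQAP register, so redirect
      // the instruction's def to OQAP and copy OQAP into the original
      // destination with a MOV.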
      if (TII->isLDSRetInstr(MI.getOpcode())) {
        int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        assert(DstIdx != -1);
        MachineOperand &DstOp = MI.getOperand(DstIdx);
        MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
                                               DstOp.getReg(), AMDGPU::OQAP);
        DstOp.setReg(AMDGPU::OQAP);
        int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::pred_sel);
        int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
                                           AMDGPU::OpName::pred_sel);
        // Copy the pred_sel bit
        Mov->getOperand(MovPredSelIdx).setReg(
            MI.getOperand(LDSPredSelIdx).getReg());
      }

      switch (MI.getOpcode()) {
      default: break;
      // Expand PRED_X to one of the PRED_SET instructions.
      case AMDGPU::PRED_X: {
        uint64_t Flags = MI.getOperand(3).getImm();
        // The native opcode used by PRED_X is stored as an immediate in the
        // third operand.
        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
                                            MI.getOperand(2).getImm(), // opcode
                                            MI.getOperand(0).getReg(), // dst
                                            MI.getOperand(1).getReg(), // src0
                                            AMDGPU::ZERO);             // src1
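        // Mask the destination write; only the predicate side effect of the
        // PRED_SET instruction is used here.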
        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
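        // MO_FLAG_PUSH requests the exec-mask updating form; otherwise only
        // the predicate register is updated.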
        if (Flags & MO_FLAG_PUSH) {
          TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1);
        } else {
          TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1);
        }
        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_PAIR_XY: {
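        // INTERP_PAIR_XY produces X and Y; the Z and W slots are emitted
        // write-masked so the whole instruction group is occupied.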
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(2).getImm());

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;

          if (Chan < 2)
            DstReg = MI.getOperand(Chan).getReg();
          else
            DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W;

          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);

          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan >= 2)
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_PAIR_ZW: {
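        // INTERP_PAIR_ZW is the mirror case: Z and W carry the results, while
        // the X and Y slots are emitted write-masked to pad the group.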
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(2).getImm());

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;

          if (Chan < 2)
            DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y;
          else
            DstReg = MI.getOperand(Chan-2).getReg();

          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);

          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan < 2)
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_VEC_LOAD: {
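        // INTERP_LOAD_P0 is emitted once per channel, each slot writing one
        // sub-register of the vector destination.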
        const R600RegisterInfo &TRI = TII->getRegisterInfo();
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(1).getImm());
        unsigned DstReg = MI.getOperand(0).getReg();

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0,
              TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }
      case AMDGPU::DOT_4: {
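        // DOT_4 produces a single scalar result; emit one slot per channel
        // and mask every write except the channel of the original
        // destination.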

        const R600RegisterInfo &TRI = TII->getRegisterInfo();

        unsigned DstReg = MI.getOperand(0).getReg();
        unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          bool Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned SubDstReg =
              AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
          MachineInstr *BMI =
              TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Mask) {
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          }
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
          unsigned Opcode = BMI->getOpcode();
          // While not strictly necessary from a hardware point of view, we
          // force all source operands of a DOT4 instruction to belong to the
          // same slot.
          unsigned Src0 = BMI->getOperand(
              TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
              .getReg();
          unsigned Src1 = BMI->getOperand(
              TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
              .getReg();
          (void) Src0;
          (void) Src1;
          if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
              (TRI.getEncodingValue(Src1) & 0xff) < 127)
            assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
        }
        MI.eraseFromParent();
        continue;
      }
      }

      bool IsReduction = TII->isReductionOp(MI.getOpcode());
      bool IsVector = TII->isVector(MI);
      bool IsCube = TII->isCubeOp(MI.getOpcode());
      if (!IsReduction && !IsVector && !IsCube) {
        continue;
      }

      // Expand the instruction
      //
      // Reduction instructions:
      // T0_X = DP4 T1_XYZW, T2_XYZW
      // becomes:
      // T0_X = DP4 T1_X, T2_X
      // T0_Y (write masked) = DP4 T1_Y, T2_Y
      // T0_Z (write masked) = DP4 T1_Z, T2_Z
      // T0_W (write masked) = DP4 T1_W, T2_W
      //
      // Vector instructions:
      // T0_X = MULLO_INT T1_X, T2_X
      // becomes:
      // T0_X = MULLO_INT T1_X, T2_X
      // T0_Y (write masked) = MULLO_INT T1_X, T2_X
      // T0_Z (write masked) = MULLO_INT T1_X, T2_X
      // T0_W (write masked) = MULLO_INT T1_X, T2_X
      //
      // Cube instructions:
      // T0_XYZW = CUBE T1_XYZW
      // becomes:
      // T0_X = CUBE T1_Z, T1_Y
      // T0_Y = CUBE T1_Z, T1_X
      // T0_Z = CUBE T1_X, T1_Z
      // T0_W = CUBE T1_Y, T1_Z
      for (unsigned Chan = 0; Chan < 4; Chan++) {
        unsigned DstReg = MI.getOperand(
                            TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
        unsigned Src0 = MI.getOperand(
                           TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
        unsigned Src1 = 0;

        // Determine the correct source registers
        if (!IsCube) {
          int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
          if (Src1Idx != -1) {
            Src1 = MI.getOperand(Src1Idx).getReg();
          }
        }
        if (IsReduction) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          Src0 = TRI.getSubReg(Src0, SubRegIndex);
          Src1 = TRI.getSubReg(Src1, SubRegIndex);
        } else if (IsCube) {
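          // Each CUBE slot reads a fixed pair of channels from the single
          // vector source: src0 from CubeSrcSwz[Chan] and src1 from
          // CubeSrcSwz[3 - Chan], matching the comment block above.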
          static const int CubeSrcSwz[] = {2, 2, 0, 1};
          unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
          unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
          Src1 = TRI.getSubReg(Src0, SubRegIndex1);
          Src0 = TRI.getSubReg(Src0, SubRegIndex0);
        }

        // Determine the correct destination registers.
        bool Mask = false;
        bool NotLast = true;
        if (IsCube) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          DstReg = TRI.getSubReg(DstReg, SubRegIndex);
        } else {
          // Mask the write if the original instruction does not write to
          // the current channel.
          Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
          DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
        }

        // Set the NotLast bit
        NotLast = (Chan != 3);

        // Add the new instruction
        unsigned Opcode = MI.getOpcode();
        switch (Opcode) {
        case AMDGPU::CUBE_r600_pseudo:
          Opcode = AMDGPU::CUBE_r600_real;
          break;
        case AMDGPU::CUBE_eg_pseudo:
          Opcode = AMDGPU::CUBE_eg_real;
          break;
        default:
          break;
        }

        MachineInstr *NewMI =
          TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);

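        // Bundle each new slot with the previous one so the expansion forms a
        // single instruction group.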
        if (Chan != 0)
          NewMI->bundleWithPred();
        if (Mask) {
          TII->addFlag(NewMI, 0, MO_FLAG_MASK);
        }
        if (NotLast) {
          TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
        }
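        // Propagate the clamp, literal, and source-modifier operands from the
        // original instruction onto the new slot.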
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
      }
      MI.eraseFromParent();
    }
  }
  return false;
}