R600ExpandSpecialInstrs.cpp revision de2d8694e25a814696358e95141f4b1aa4d8847e
//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Vector, Reduction, and Cube instructions need to fill the entire instruction
/// group to work correctly.  This pass expands these individual instructions
/// into several instructions that will completely fill the instruction group.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

namespace {

class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
private:
  static char ID;
  const R600InstrInfo *TII;

  void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI,
      unsigned Op);

public:
  R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
    TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "R600 Expand special instructions pass";
  }
};

} // End anonymous namespace

char R600ExpandSpecialInstrsPass::ID = 0;

FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
  return new R600ExpandSpecialInstrsPass(TM);
}

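// Copy the immediate value of flag operand \p Op (e.g. clamp, literal, or a
// source modifier) from \p OldMI to \p NewMI, if \p OldMI has that operand.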
void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
    const MachineInstr *OldMI, unsigned Op) {
  int OpIdx = TII->getOperandIdx(*OldMI, Op);
  if (OpIdx > -1) {
    uint64_t Val = OldMI->getOperand(OpIdx).getImm();
    TII->setImmOperand(*NewMI, Op, Val);
  }
}

bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
  const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
  TII = ST.getInstrInfo();

  const R600RegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator I = MBB.begin();
    while (I != MBB.end()) {
      MachineInstr &MI = *I;
      I = std::next(I);

      // Expand LDS_*_RET instructions
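      // An LDS read returns its result in the OQAP register, so rewrite the
      // instruction to define OQAP directly and insert a MOV that copies
      // OQAP into the destination the original instruction requested.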
      if (TII->isLDSRetInstr(MI.getOpcode())) {
        int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        assert(DstIdx != -1);
        MachineOperand &DstOp = MI.getOperand(DstIdx);
        MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
                                               DstOp.getReg(), AMDGPU::OQAP);
        DstOp.setReg(AMDGPU::OQAP);
        int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::pred_sel);
        int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
                                           AMDGPU::OpName::pred_sel);
        // Copy the pred_sel bit
        Mov->getOperand(MovPredSelIdx).setReg(
            MI.getOperand(LDSPredSelIdx).getReg());
      }

      switch (MI.getOpcode()) {
      default: break;
      // Expand PRED_X to one of the PRED_SET instructions.
      case AMDGPU::PRED_X: {
        // The native opcode used by PRED_X is stored as an immediate in
        // operand 2; operand 3 holds the instruction flags.
        uint64_t Flags = MI.getOperand(3).getImm();
        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
                                            MI.getOperand(2).getImm(), // opcode
                                            MI.getOperand(0).getReg(), // dst
                                            MI.getOperand(1).getReg(), // src0
                                            AMDGPU::ZERO);             // src1
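        // Mask the GPR write; the PRED_SET is emitted only for its
        // predicate or exec-mask update side effect.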
        TII->addFlag(*PredSet, 0, MO_FLAG_MASK);
        if (Flags & MO_FLAG_PUSH) {
          TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1);
        } else {
          TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1);
        }
        MI.eraseFromParent();
        continue;
      }
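      // INTERP_PAIR_XY produces the X and Y channels of an interpolated
      // attribute.  All four slots of the instruction group must still be
      // occupied, so the Z and W slots get write-masked INTERP_XY copies.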
      case AMDGPU::INTERP_PAIR_XY: {
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(2).getImm());

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;

          if (Chan < 2)
            DstReg = MI.getOperand(Chan).getReg();
          else
            DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W;

          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);

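          // Bundle every slot after the first with its predecessor so the
          // four ALU instructions stay in one instruction group.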
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan >= 2)
            TII->addFlag(*BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
      }
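      // INTERP_PAIR_ZW is the mirror image: the X and Y slots are
      // write-masked and the Z and W slots carry the live results.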
      case AMDGPU::INTERP_PAIR_ZW: {
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(2).getImm());

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;

          if (Chan < 2)
            DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y;
          else
            DstReg = MI.getOperand(Chan-2).getReg();

          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);

          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan < 2)
            TII->addFlag(*BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
      }
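      // INTERP_VEC_LOAD loads a parameter into all four channels of the
      // destination vector register, one slot per channel.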
      case AMDGPU::INTERP_VEC_LOAD: {
        const R600RegisterInfo &TRI = TII->getRegisterInfo();
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(1).getImm());
        unsigned DstReg = MI.getOperand(0).getReg();

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0,
              TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan != 3)
            TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
      }
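      // DOT_4 is a reduction: emit one slot per channel and mask every
      // write except the one that lands in the destination's own channel.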
      case AMDGPU::DOT_4: {
        const R600RegisterInfo &TRI = TII->getRegisterInfo();

        unsigned DstReg = MI.getOperand(0).getReg();
        unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          bool Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned SubDstReg =
              AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
          MachineInstr *BMI =
              TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Mask) {
            TII->addFlag(*BMI, 0, MO_FLAG_MASK);
          }
          if (Chan != 3)
            TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
          unsigned Opcode = BMI->getOpcode();
          // While not strictly necessary from a hardware point of view, we
          // force all src operands of a dot4 instruction to belong to the
          // same slot.
          unsigned Src0 = BMI->getOperand(
              TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
              .getReg();
          unsigned Src1 = BMI->getOperand(
              TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
              .getReg();
          (void) Src0;
          (void) Src1;
          if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
              (TRI.getEncodingValue(Src1) & 0xff) < 127)
            assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
        }
        MI.eraseFromParent();
        continue;
      }
      }

      bool IsReduction = TII->isReductionOp(MI.getOpcode());
      bool IsVector = TII->isVector(MI);
      bool IsCube = TII->isCubeOp(MI.getOpcode());
      if (!IsReduction && !IsVector && !IsCube) {
        continue;
      }

      // Expand the instruction
      //
      // Reduction instructions:
      // T0_X = DP4 T1_XYZW, T2_XYZW
      // becomes:
      // T0_X = DP4 T1_X, T2_X
      // T0_Y (write masked) = DP4 T1_Y, T2_Y
      // T0_Z (write masked) = DP4 T1_Z, T2_Z
      // T0_W (write masked) = DP4 T1_W, T2_W
      //
      // Vector instructions:
      // T0_X = MULLO_INT T1_X, T2_X
      // becomes:
      // T0_X = MULLO_INT T1_X, T2_X
      // T0_Y (write masked) = MULLO_INT T1_X, T2_X
      // T0_Z (write masked) = MULLO_INT T1_X, T2_X
      // T0_W (write masked) = MULLO_INT T1_X, T2_X
      //
      // Cube instructions:
      // T0_XYZW = CUBE T1_XYZW
      // becomes:
      // T0_X = CUBE T1_Z, T1_Y
      // T0_Y = CUBE T1_Z, T1_X
      // T0_Z = CUBE T1_X, T1_Z
      // T0_W = CUBE T1_Y, T1_Z
      for (unsigned Chan = 0; Chan < 4; Chan++) {
        unsigned DstReg = MI.getOperand(
                            TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
        unsigned Src0 = MI.getOperand(
                           TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
        unsigned Src1 = 0;

        // Determine the correct source registers
        if (!IsCube) {
          int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
          if (Src1Idx != -1) {
            Src1 = MI.getOperand(Src1Idx).getReg();
          }
        }
        if (IsReduction) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          Src0 = TRI.getSubReg(Src0, SubRegIndex);
          Src1 = TRI.getSubReg(Src1, SubRegIndex);
        } else if (IsCube) {
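          // Per-channel source swizzle for CUBE: the four slots consume
          // (Z,Y), (Z,X), (X,Z), and (Y,Z), matching the expansion above.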
          static const int CubeSrcSwz[] = {2, 2, 0, 1};
          unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
          unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
          Src1 = TRI.getSubReg(Src0, SubRegIndex1);
          Src0 = TRI.getSubReg(Src0, SubRegIndex0);
        }

        // Determine the correct destination registers.
        bool Mask = false;
        bool NotLast = true;
        if (IsCube) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          DstReg = TRI.getSubReg(DstReg, SubRegIndex);
        } else {
          // Mask the write if the original instruction does not write to
          // the current channel.
          Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
          DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
        }

        // Set the IsLast bit
        NotLast = (Chan != 3);

        // Add the new instruction; pseudo CUBE opcodes are lowered to their
        // per-slot "real" forms.
        unsigned Opcode = MI.getOpcode();
        switch (Opcode) {
        case AMDGPU::CUBE_r600_pseudo:
          Opcode = AMDGPU::CUBE_r600_real;
          break;
        case AMDGPU::CUBE_eg_pseudo:
          Opcode = AMDGPU::CUBE_eg_real;
          break;
        default:
          break;
        }

        MachineInstr *NewMI =
          TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);

        if (Chan != 0)
          NewMI->bundleWithPred();
        if (Mask) {
          TII->addFlag(*NewMI, 0, MO_FLAG_MASK);
        }
        if (NotLast) {
          TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST);
        }
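        // Propagate the per-operand modifier flags from the original
        // instruction onto the new slot.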
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
      }
      MI.eraseFromParent();
    }
  }
  return false;
}