R600MachineScheduler.cpp revision cd81d94322a39503e4a3e87b6ee03d4fcb3465fb
//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "misched"

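// Cache the DAG, target info and register info pointers and reset the
// per-clause scheduling state before a region is scheduled.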
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;

  const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

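// Append every SUnit from QSrc to the end of QDst and clear QSrc.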
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst)
{
  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
  QSrc.clear();
}

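// Number of wavefronts that can be resident when each one needs GPRCount
// registers, assuming 248 usable GPRs (the constant below).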
static
unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

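// Pick the next SUnit to schedule (bottom-up). We try to stay in the current
// clause type until its instruction limit is reached; a heuristic below may
// force an early switch away from ALU to keep fetch-related GPR pressure low.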
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // Check if we might want to switch the current clause type.
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel Processing
    // OpenCL Programming Guide:
    // The approx. number of WF that allows TEX inst to hide ALU inst is:
    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
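    // 500 / 8 = 62.5 is the constant used below; e.g. with an ALU:fetch ratio
    // of 5, about 62.5 / 5 = 12.5 (rounded down to 12) wavefronts are needed.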
    float ALUFetchRationEstimate =
        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
    DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
    // We assume the local GPR requirements to be "dominated" by the requirement
    // of the TEX clause (which consumes 128-bit regs); ALU inst before and
    // after TEX are indeed likely to consume or generate values from/for the
    // TEX clause.
    // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
    // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (needs
    // one GPR) or TmXYZW = TnXYZW (needs 2 GPRs).
    // (TODO: use RegisterPressure)
    // If we are going to use too many GPRs, we flush Fetch instructions to
    // lower register pressure on 128-bit regs.
    unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
    if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
      AllowSwitchFromAlu = true;
  }

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // try to pick ALU
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // try to pick FETCH
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // try to pick other
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  DEBUG(
      if (SU) {
        dbgs() << " ** Pick node **\n";
        SU->dump(DAG);
      } else {
        dbgs() << "NO NODE \n";
        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
          const SUnit &S = DAG->SUnits[i];
          if (!S.isScheduled)
            S.dump(DAG);
        }
      }
  );

  return SU;
}

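// Update clause bookkeeping once SU has been scheduled: reset counters on a
// clause-type switch and count emitted ALU slots (a full-vector op takes four
// slots, and every ALU_LITERAL_X operand costs one extra slot).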
void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
          E = SU->getInstr()->operands_end(); It != E; ++It) {
        MachineOperand &MO = *It;
        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    }
  } else {
    ++CurEmitted;
  }

  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  } else
    FetchInstCount++;
}

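// Returns true for a COPY whose source operand is a physical register.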
static bool
isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::COPY)
    return false;

  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
}

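// Route a newly ready SUnit: physical-register copies are held in their own
// list, IDOther nodes become available immediately, and ALU/fetch nodes are
// parked in the Pending queue for their kind.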
void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause, so we can schedule one as soon as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);
}

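// Physical registers must be contained in RC; virtual registers must have
// exactly RC as their register class.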
bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
                                          const TargetRegisterClass *RC) const {
  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
    return RC->contains(Reg);
  } else {
    return MRI->getRegClass(Reg) == RC;
  }
}

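// Classify an ALU SUnit by the VLIW slot(s) it can occupy: a specific channel
// (X/Y/Z/W), any channel, the trans slot, a whole instruction group, or
// discarded (undef copies that will become KILLs).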
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case AMDGPU::PRED_X:
    return AluPredX;
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT_4:
    return AluT_XYZW;
  case AMDGPU::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL; don't consider it in scheduling.
      return AluDiscarded;
    }
  default:
    break;
  }

  // Does the instruction take a whole IG?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction()?
  if (TII->isVector(*MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case AMDGPU::sub0:
    return AluT_X;
  case AMDGPU::sub1:
    return AluT_Y;
  case AMDGPU::sub2:
    return AluT_Z;
  case AMDGPU::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  unsigned DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(MI))
    return AluT_XYZW;

  return AluAny;
}

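// Coarse classification used for clause formation: fetch (texture/vertex
// cache reads), ALU, or other.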
int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case AMDGPU::PRED_X:
  case AMDGPU::COPY:
  case AMDGPU::CONST_COPY:
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

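// Scan Q from the most recently released node and pop the first SUnit that
// still fits the constant-read limits of the instruction group being built;
// when AnyALU is set, vector-only instructions are skipped as well.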
SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
      It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(SU->getInstr()))) {
      InstructionsGroupCandidate.pop_back();
      Q.erase((It + 1).base());
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return nullptr;
}

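// Move every pending ALU SUnit into the per-slot AvailableAlus buckets.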
void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
    AluKind AK = getAluKind(QSrc[i]);
    AvailableAlus[AK].push_back(QSrc[i]);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  DEBUG(dbgs() << "New Slot\n");
  assert(OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
//    OccupedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

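// Constrain the register class of MI's destination so that it will be
// allocated to the requested channel; bail out if the destination is also
// read by MI (constraining it then can crash the pressure tracker).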
void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  unsigned DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is defined and used in the same
  // inst and we try to constrain its regclass.
  for (MachineInstr::mop_iterator It = MI->operands_begin(),
      E = MI->operands_end(); It != E; ++It) {
    MachineOperand &MO = *It;
    if (MO.isReg() && !MO.isDef() &&
        MO.getReg() == DestReg)
      return;
  }
  // Constrain the regclass of DestReg to assign it to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
    break;
  }
}

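// Try a candidate already restricted to this slot; failing that, take an
// AluAny instruction and constrain its destination register to the slot.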
SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;
  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}

unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
      AvailableAlus[AluPredX].size();
}

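// Build one VLIW instruction group bottom-up: PRED_X, discarded copies and
// full-vector ops take a whole group; otherwise the trans slot (on VLIW5
// targets) and the W down to X channels are filled in turn, starting a new
// group once nothing fits.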
SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupedSlotsMask) {
      // Bottom-up scheduling: PRED_X must come first.
      if (!AvailableAlus[AluPredX].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush physical reg copies (RA will discard them).
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW alu available, use it.
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccuped = OccupedSlotsMask & 16;
    if (!TransSlotOccuped && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

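// Pop the most recently released SUnit from the Available queue for QID,
// refilling it from Pending when it is empty.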
SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.resize(AQ.size() - 1);
  }
  return SU;
}