//===----------------------- SIFrameLowering.cpp --------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

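// Returns true if all stack objects are SGPR spill slots. SGPR spills are
// lowered to VGPR lanes, so in that case no scratch memory is actually used.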
static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
                              const MachineFrameInfo *FrameInfo) {
  return FuncInfo->hasSpilledSGPRs() &&
    !FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects();
}

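// All SGPR_128 (four-dword) registers, in register class order.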
static ArrayRef<MCPhysReg> getAllSGPR128() {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      AMDGPU::SGPR_128RegClass.getNumRegs());
}

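// All individual 32-bit SGPRs, in register class order.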
static ArrayRef<MCPhysReg> getAllSGPRs() {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
                      AMDGPU::SGPR_32RegClass.getNumRegs());
}

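// Emit the kernel prologue: set up FLAT_SCRATCH if needed, then initialize
// the scratch resource descriptor and scratch wave offset registers from the
// preloaded input registers.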
void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
  // specified.
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  if (ST.debuggerEmitPrologue())
    emitDebuggerPrologue(MF, MBB);

  if (!MF.getFrameInfo()->hasStackObjects())
    return;

  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.
  if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
    return;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineBasicBlock::iterator I = MBB.begin();

  // We need to insert initialization of the scratch resource descriptor.
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  assert(ScratchRsrcReg != AMDGPU::NoRegister);

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);

  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdHsaOS()) {
    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
  }

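  // Initialize the FLAT_SCR register pair from the preloaded FLAT_SCRATCH_INIT
  // value so flat instructions can reach scratch memory (see
  // enable_sgpr_flat_scratch_init in AMDKernelCodeT.h).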
  if (MFI->hasFlatScratchInit()) {
    // We don't need this if we only have spills, since there is no
    // user-facing scratch.

    // TODO: If we know earlier that there are no flat instructions, we can
    // omit this from the input registers.
    //
    // TODO: We only need to know if we access scratch space through a flat
    // pointer. Because we only detect if flat instructions are used at all,
    // this will be used more often than necessary on VI.

    // Debug location must be unknown since the first debug location is used to
    // determine the end of the prologue.
    DebugLoc DL;

    unsigned FlatScratchInitReg =
      TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);

    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    // Copy the size in bytes.
    unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitHi, RegState::Kill);

    unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);

    // Add the wave offset in bytes to the private base offset.
    // See the comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

    // Convert the offset to 256-byte units.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitLo, RegState::Kill)
      .addImm(8);
  }

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.
  if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
    // We should always reserve these 5 registers at the same time.
    assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
           "scratch wave offset and private segment buffer inconsistent");
    return;
  }

  // We added live-ins during argument lowering, but since they were not used,
  // they were deleted. We're adding the uses now, so add them back.
  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);

  if (ST.isAmdHsaOS()) {
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

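  // With the SGPR init bug the wave gets a fixed SGPR allocation, so there is
  // nothing to gain from shifting the reserved registers down.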
  if (!ST.hasSGPRInitBug()) {
    // We reserved the last registers for this. Shift them down to the end of
    // those which were actually used.
    //
    // FIXME: It might be safer to use a pseudoregister before replacement.

    // FIXME: We should be able to eliminate unused input registers. The only
    // ones we cannot eliminate are the resources required for scratch access.
    // For now we skip over user SGPRs and may leave unused holes.

    // We find the resource first because it has an alignment requirement.
    if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
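      // getNumPreloadedSGPRs counts individual 32-bit SGPRs; each SGPR_128
      // tuple covers four of them, hence the division.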
      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
      // Skip the last 2 elements because the last one is reserved for VCC, and
      // this is the 2nd to last element already.
      for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
        // Pick the first unallocated one. Make sure we don't clobber the other
        // reserved input we needed.
        if (!MRI.isPhysRegUsed(Reg)) {
          assert(MRI.isAllocatable(Reg));
          MRI.replaceRegWith(ScratchRsrcReg, Reg);
          ScratchRsrcReg = Reg;
          MFI->setScratchRSrcReg(ScratchRsrcReg);
          break;
        }
      }
    }

    if (ScratchWaveOffsetReg ==
        TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

      // We need to drop the registers from the end of the list that we cannot
      // use for the scratch wave offset:
      // + 2 because s102 and s103 do not exist on VI.
      // + 2 for vcc.
      // + 2 for xnack_mask.
      // + 2 for flat_scratch.
      // + 4 for the registers reserved for the scratch resource register.
      // + 1 for the register reserved for the scratch wave offset. (By
      //     excluding this register from the candidates, if it is already
      //     being used for the scratch wave offset and there are no other
      //     free SGPRs, the value will simply stay in this register.)
      // ----
      //  13
      for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) {
        // Pick the first unallocated SGPR. Be careful not to pick an alias of
        // the scratch descriptor, since we haven't added its uses yet.
        if (!MRI.isPhysRegUsed(Reg)) {
          if (!MRI.isAllocatable(Reg) ||
              TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
            continue;

          MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
          ScratchWaveOffsetReg = Reg;
          MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
          break;
        }
      }
    }
  }

  assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));

  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  DebugLoc DL;

  if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
    // Make sure we emit the copy for the offset first. We may have chosen to
    // copy the buffer resource into a register that aliases the input offset
    // register.
    BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
  }

  if (ST.isAmdHsaOS()) {
    // Insert copies from the argument register.
    assert(
      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));

    unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3);

    unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1);
    unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3);

    const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64);

    BuildMI(MBB, I, DL, SMovB64, Rsrc01)
      .addReg(Lo, RegState::Kill);
    BuildMI(MBB, I, DL, SMovB64, Rsrc23)
      .addReg(Hi, RegState::Kill);
  } else {
    unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer: the driver resolves the
    // SCRATCH_RSRC_DWORD0/1 symbols to the scratch buffer address. Set up the
    // other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
    BuildMI(MBB, I, DL, SMovB32, Rsrc0)
      .addExternalSymbol("SCRATCH_RSRC_DWORD0")
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc1)
      .addExternalSymbol("SCRATCH_RSRC_DWORD1")
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }

  // Make the selected registers live throughout the function.
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    OtherBB.addLiveIn(ScratchRsrcReg);
    OtherBB.addLiveIn(ScratchWaveOffsetReg);
  }
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
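  // Nothing to do: there is no stack pointer adjustment to undo, and the
  // scratch setup from the prologue stays live for the whole function.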
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo *MFI = MF.getFrameInfo();

  if (!MFI->hasStackObjects())
    return;

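  // Frame index elimination may need a free SGPR to materialize a large
  // offset, so give the register scavenger an emergency spill slot.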
  assert(RS && "RegScavenger required if spilling");

  int ScavengeFI = MFI->CreateSpillStackObject(
    AMDGPU::SGPR_32RegClass.getSize(),
    AMDGPU::SGPR_32RegClass.getAlignment());
  RS->addScavengingFrameIndex(ScavengeFI);
}
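// Spill the work group ID SGPRs and work item ID VGPRs to dedicated stack
// objects at the start of the kernel so the debugger can find them.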
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MachineBasicBlock::iterator I = MBB.begin();
  DebugLoc DL;

  // For each dimension:
  for (unsigned i = 0; i < 3; ++i) {
    // Get work group ID SGPR, and make it live-in again.
    unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
    MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
    MBB.addLiveIn(WorkGroupIDSGPR);

    // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
    // order to spill it to scratch.
    unsigned WorkGroupIDVGPR =
      MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
      .addReg(WorkGroupIDSGPR);

    // Spill work group ID.
    int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
      WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);

    // Get work item ID VGPR, and make it live-in again.
    unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
    MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
    MBB.addLiveIn(WorkItemIDVGPR);

    // Spill work item ID.
    int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
      WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
  }
}