//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer  -------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11///
12/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
13/// code.  When passed an MCAsmStreamer it prints assembly and when passed
14/// an MCObjectStreamer it outputs binary code.
15//
16//===----------------------------------------------------------------------===//
17//
18
19
20#include "AMDGPUAsmPrinter.h"
21#include "AMDGPU.h"
22#include "AMDGPUSubtarget.h"
23#include "R600Defines.h"
24#include "R600MachineFunctionInfo.h"
25#include "R600RegisterInfo.h"
26#include "SIDefines.h"
27#include "SIMachineFunctionInfo.h"
28#include "SIRegisterInfo.h"
29#include "llvm/MC/MCContext.h"
30#include "llvm/MC/MCSectionELF.h"
31#include "llvm/MC/MCStreamer.h"
32#include "llvm/Support/ELF.h"
33#include "llvm/Support/MathExtras.h"
34#include "llvm/Support/TargetRegistry.h"
35#include "llvm/Target/TargetLoweringObjectFile.h"
36
37using namespace llvm;
38
39// TODO: This should get the default rounding mode from the kernel. We just set
40// the default here, but this could change if the OpenCL rounding mode pragmas
41// are used.
42//
43// The denormal mode here should match what is reported by the OpenCL runtime
44// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
46//
47// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
48// precision, and leaves single precision to flush all and does not report
49// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
50// CL_FP_DENORM for both.
51static uint32_t getFPMode(MachineFunction &) {
52  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
53         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
54         FP_DENORM_MODE_SP(FP_DENORM_FLUSH_NONE) |
55         FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE);
56}
57
58static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
59                                              MCStreamer &Streamer) {
60  return new AMDGPUAsmPrinter(tm, Streamer);
61}
62
63extern "C" void LLVMInitializeR600AsmPrinter() {
64  TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
65}
66
67AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
68    : AsmPrinter(TM, Streamer) {
69  DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode();
70}
71
72bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
73  SetupMachineFunction(MF);
74
75  OutStreamer.emitRawComment(Twine('@') + MF.getName() + Twine(':'));
76
77  MCContext &Context = getObjFileLowering().getContext();
78  const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
79                                              ELF::SHT_PROGBITS, 0,
80                                              SectionKind::getReadOnly());
81  OutStreamer.SwitchSection(ConfigSection);
82
83  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
84  SIProgramInfo KernelInfo;
85  if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
86    getSIProgramInfo(KernelInfo, MF);
87    EmitProgramInfoSI(MF, KernelInfo);
88  } else {
89    EmitProgramInfoR600(MF);
90  }
91
92  DisasmLines.clear();
93  HexLines.clear();
94  DisasmLineMaxLen = 0;
95
96  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
97  EmitFunctionBody();
98
99  if (isVerbose()) {
100    const MCSectionELF *CommentSection
101      = Context.getELFSection(".AMDGPU.csdata",
102                              ELF::SHT_PROGBITS, 0,
103                              SectionKind::getReadOnly());
104    OutStreamer.SwitchSection(CommentSection);
105
106    if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
107      OutStreamer.emitRawComment(" Kernel info:", false);
108      OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
109                                 false);
110      OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
111                                 false);
112      OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
113                                 false);
114      OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
115                                 false);
116      OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
117                                 false);
118    } else {
119      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
120      OutStreamer.emitRawComment(
121        Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
122    }
123  }
124
125  if (STM.dumpCode()) {
126#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
127    MF.dump();
128#endif
129
130    if (DisasmEnabled) {
131      OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
132                                                  ELF::SHT_NOTE, 0,
133                                                  SectionKind::getReadOnly()));
134
135      for (size_t i = 0; i < DisasmLines.size(); ++i) {
136        std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
137        Comment += " ; " + HexLines[i] + "\n";
138
139        OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
140        OutStreamer.EmitBytes(StringRef(Comment));
141      }
142    }
143  }
144
145  return false;
146}
147
148void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
149  unsigned MaxGPR = 0;
150  bool killPixel = false;
151  const R600RegisterInfo * RI =
152                static_cast<const R600RegisterInfo*>(TM.getRegisterInfo());
153  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
154  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
155
156  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
157                                                  BB != BB_E; ++BB) {
158    MachineBasicBlock &MBB = *BB;
159    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
160                                                    I != E; ++I) {
161      MachineInstr &MI = *I;
162      if (MI.getOpcode() == AMDGPU::KILLGT)
163        killPixel = true;
164      unsigned numOperands = MI.getNumOperands();
165      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
166        MachineOperand & MO = MI.getOperand(op_idx);
167        if (!MO.isReg())
168          continue;
169        unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
170
171        // Register with value > 127 aren't GPR
172        if (HWReg > 127)
173          continue;
174        MaxGPR = std::max(MaxGPR, HWReg);
175      }
176    }
177  }
178
179  unsigned RsrcReg;
180  if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
181    // Evergreen / Northern Islands
182    switch (MFI->ShaderType) {
183    default: // Fall through
184    case ShaderType::COMPUTE:  RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
185    case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
186    case ShaderType::PIXEL:    RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
187    case ShaderType::VERTEX:   RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
188    }
189  } else {
190    // R600 / R700
191    switch (MFI->ShaderType) {
192    default: // Fall through
193    case ShaderType::GEOMETRY: // Fall through
194    case ShaderType::COMPUTE:  // Fall through
195    case ShaderType::VERTEX:   RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
196    case ShaderType::PIXEL:    RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
197    }
198  }
199
200  OutStreamer.EmitIntValue(RsrcReg, 4);
201  OutStreamer.EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
202                           S_STACK_SIZE(MFI->StackSize), 4);
203  OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
204  OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
205
206  if (MFI->ShaderType == ShaderType::COMPUTE) {
207    OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
208    OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
209  }
210}
211
212void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
213                                        MachineFunction &MF) const {
214  uint64_t CodeSize = 0;
215  unsigned MaxSGPR = 0;
216  unsigned MaxVGPR = 0;
217  bool VCCUsed = false;
218  const SIRegisterInfo * RI =
219                static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
220
221  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
222                                                  BB != BB_E; ++BB) {
223    MachineBasicBlock &MBB = *BB;
224    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
225                                                    I != E; ++I) {
226      MachineInstr &MI = *I;
227
228      // TODO: CodeSize should account for multiple functions.
229      CodeSize += MI.getDesc().Size;
230
231      unsigned numOperands = MI.getNumOperands();
232      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
233        MachineOperand &MO = MI.getOperand(op_idx);
234        unsigned width = 0;
235        bool isSGPR = false;
236
237        if (!MO.isReg()) {
238          continue;
239        }
240        unsigned reg = MO.getReg();
241        if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
242	    reg == AMDGPU::VCC_HI) {
243          VCCUsed = true;
244          continue;
245        }
246
247        switch (reg) {
248        default: break;
249        case AMDGPU::SCC:
250        case AMDGPU::EXEC:
251        case AMDGPU::M0:
252          continue;
253        }
254
255        if (AMDGPU::SReg_32RegClass.contains(reg)) {
256          isSGPR = true;
257          width = 1;
258        } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
259          isSGPR = false;
260          width = 1;
261        } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
262          isSGPR = true;
263          width = 2;
264        } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
265          isSGPR = false;
266          width = 2;
267        } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
268          isSGPR = false;
269          width = 3;
270        } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
271          isSGPR = true;
272          width = 4;
273        } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
274          isSGPR = false;
275          width = 4;
276        } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
277          isSGPR = true;
278          width = 8;
279        } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
280          isSGPR = false;
281          width = 8;
282        } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
283          isSGPR = true;
284          width = 16;
285        } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
286          isSGPR = false;
287          width = 16;
288        } else {
289          llvm_unreachable("Unknown register class");
290        }
291        unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
292        unsigned maxUsed = hwReg + width - 1;
293        if (isSGPR) {
294          MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
295        } else {
296          MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
297        }
298      }
299    }
300  }
301
302  if (VCCUsed)
303    MaxSGPR += 2;
304
305  ProgInfo.NumVGPR = MaxVGPR;
306  ProgInfo.NumSGPR = MaxSGPR;
307
308  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
309  // register.
310  ProgInfo.FloatMode = getFPMode(MF);
311
312  // XXX: Not quite sure what this does, but sc seems to unset this.
313  ProgInfo.IEEEMode = 0;
314
315  // Do not clamp NAN to 0.
316  ProgInfo.DX10Clamp = 0;
317
318  ProgInfo.CodeLen = CodeSize;
319}
320
321void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
322                                         const SIProgramInfo &KernelInfo) {
323  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
324  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
325
326  unsigned RsrcReg;
327  switch (MFI->ShaderType) {
328  default: // Fall through
329  case ShaderType::COMPUTE:  RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
330  case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
331  case ShaderType::PIXEL:    RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break;
332  case ShaderType::VERTEX:   RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
333  }
334
335  unsigned LDSAlignShift;
336  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
337    // LDS is allocated in 64 dword blocks.
338    LDSAlignShift = 8;
339  } else {
340    // LDS is allocated in 128 dword blocks.
341    LDSAlignShift = 9;
342  }
343
344  unsigned LDSBlocks =
345    RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
346
347  if (MFI->ShaderType == ShaderType::COMPUTE) {
348    OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
349
350    const uint32_t ComputePGMRSrc1 =
351      S_00B848_VGPRS(KernelInfo.NumVGPR / 4) |
352      S_00B848_SGPRS(KernelInfo.NumSGPR / 8) |
353      S_00B848_PRIORITY(KernelInfo.Priority) |
354      S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
355      S_00B848_PRIV(KernelInfo.Priv) |
356      S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
357      S_00B848_IEEE_MODE(KernelInfo.DebugMode) |
358      S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
359
360    OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
361
362    OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
363    OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
364  } else {
365    OutStreamer.EmitIntValue(RsrcReg, 4);
366    OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
367                             S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
368  }
369
370  if (MFI->ShaderType == ShaderType::PIXEL) {
371    OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
372    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
373    OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
374    OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
375  }
376}
377