//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

class AMDGPUPromoteAlloca : public FunctionPass,
                            public InstVisitor<AMDGPUPromoteAlloca> {

  static char ID;
  Module *Mod;
  const AMDGPUSubtarget &ST;
  int LocalMemAvailable;

public:
  AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
                                                   LocalMemAvailable(0) { }
  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;
  const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
  void visitAlloca(AllocaInst &I);
};

} // End anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  FunctionType *FTy = F.getFunctionType();

  LocalMemAvailable = ST.getLocalMemorySize();

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
    Type *ParamTy = FTy->getParamType(i);
    if (ParamTy->isPointerTy() &&
        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemAvailable = 0;
      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                      "local memory disabled.\n");
      break;
    }
  }

  if (LocalMemAvailable > 0) {
    // Check how much local memory is already used by global objects that are
    // referenced from this function.
    for (GlobalVariable &GV : Mod->globals()) {
      PointerType *GVTy = GV.getType();
      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      for (User *U : GV.users()) {
        Instruction *UseInst = dyn_cast<Instruction>(U);
        if (!UseInst)
          continue;
        if (UseInst->getParent()->getParent() == &F)
          LocalMemAvailable -=
              Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType());
      }
    }
  }

  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");

  visit(F);

  return false;
}

static VectorType *arrayTypeToVecType(Type *ArrayTy) {
  return VectorType::get(ArrayTy->getArrayElementType(),
                         ArrayTy->getArrayNumElements());
}

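// Map a pointer that addresses the alloca to the vector lane it refers to:
// the alloca itself is lane 0, while a GEP uses the index recorded for it in
// GEPIdx (null if the GEP was never registered).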
static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  if (isa<AllocaInst>(Ptr))
    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  auto I = GEPIdx.find(GEP);
  return I == GEPIdx.end() ? nullptr : I->second;
}

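// Return the value that selects the vector element a GEP addresses, i.e. the
// last index of a GEP of the form (alloca, 0, index). Returns null for any
// other shape of GEP.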
static Value *GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME: We only support simple cases.
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;

  return GEP->getOperand(2);
}

// Returns true if Inst is an instruction we know how to rewrite below when
// turning the alloca into a vector; User is the operand of Inst that refers
// to the alloca (directly or through a GEP).
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User) {
  switch (Inst->getOpcode()) {
  case Instruction::Load:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast:
    return true;
  case Instruction::Store: {
    // The pointer derived from the alloca must be the pointer operand of the
    // store, not the value being stored.
    StoreInst *SI = cast<StoreInst>(Inst);
    return SI->getPointerOperand() == User;
  }
  default:
    return false;
  }
}

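// Try to rewrite a small array alloca as a single vector value. Loads and
// stores through GEPs of the form (alloca, 0, index) become extractelement/
// insertelement operations on a whole-vector load/store, so the array can be
// kept in registers instead of memory.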
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
  Type *AllocaTy = Alloca->getAllocatedType();

  DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  if (!AllocaTy->isArrayTy() ||
      AllocaTy->getArrayElementType()->isVectorTy() ||
      AllocaTy->getArrayNumElements() > 4) {
    DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    if (!Index) {
      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  DEBUG(dbgs() << "  Converting alloca to vector "
        << *AllocaTy << " -> " << *VectorTy << '\n');

  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      // Replace the scalar load with a whole-vector load followed by an
      // extractelement of the addressed lane.
      Value *Ptr = Inst->getOperand(0);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      // Replace the scalar store with a read-modify-write of the whole
      // vector: load it, insert the new element, store it back.
      Value *Ptr = Inst->getOperand(1);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
                                                       Inst->getOperand(0),
                                                       Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::BitCast:
    case Instruction::AddrSpaceCast:
      break;

    default:
      Inst->dump();
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}

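// Walk the transitive users of Val, collecting every pointer-typed use that
// we know how to rewrite into the local address space. Returns false when a
// use is found that cannot be handled, e.g. an indirect call, a ptrtoint, or
// a store of the pointer itself.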
static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
  bool Success = true;
  for (User *User : Val->users()) {
    if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
      continue;
    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      // TODO: We might be able to handle some cases where the callee is a
      // constantexpr bitcast of a function.
      if (!CI->getCalledFunction())
        return false;

      WorkList.push_back(User);
      continue;
    }

    // FIXME: Correctly handle ptrtoint instructions.
    Instruction *UseInst = dyn_cast<Instruction>(User);
    if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
      // Reject stores where the alloca-derived pointer is the value being
      // stored rather than the pointer operand; the pointer would escape.
      if (SI->getPointerOperand() != Val)
        return false;
    }

    if (!User->getType()->isPointerTy())
      continue;

    WorkList.push_back(User);

    Success &= collectUsesWithPtrTypes(User, WorkList);
  }
  return Success;
}

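// Promote a single static alloca. First try the in-register vector rewrite;
// if that fails, give every work item its own copy of the alloca in an LDS
// (local memory) array and retarget all users to the local address space.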
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
  if (!I.isStaticAlloca())
    return;

  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector.
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I))
    return;

  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");

  // FIXME: This is the maximum work group size.  We should try to get the
  // value from the reqd_work_group_size function attribute if it is
  // available.
  unsigned WorkGroupSize = 256;
  int AllocaSize =
      WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);

  if (AllocaSize > LocalMemAvailable) {
    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
    return;
  }

  std::vector<Value*> WorkList;

  if (!collectUsesWithPtrTypes(&I, WorkList)) {
    DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");
  LocalMemAvailable -= AllocaSize;

  // Create one copy of the alloca per work item in the work group, laid out
  // as an array in local memory.
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::ExternalLinkage, nullptr, I.getName(),
      nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);

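  // Declare the r600 intrinsics that report the work group size and the work
  // item id; each returns an i32 and is marked readnone. They are used below
  // to compute a per-work-item offset into the LDS array.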
  FunctionType *FTy = FunctionType::get(
      Type::getInt32Ty(Mod->getContext()), false);
  AttributeSet AttrSet;
  AttrSet = AttrSet.addAttribute(Mod->getContext(),
                                 AttributeSet::FunctionIndex,
                                 Attribute::ReadNone);

  Value *ReadLocalSizeY = Mod->getOrInsertFunction(
      "llvm.r600.read.local.size.y", FTy, AttrSet);
  Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
      "llvm.r600.read.local.size.z", FTy, AttrSet);
  Value *ReadTIDIGX = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.x", FTy, AttrSet);
  Value *ReadTIDIGY = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.y", FTy, AttrSet);
  Value *ReadTIDIGZ = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.z", FTy, AttrSet);

  Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {});
  Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {});
  Value *TIdX = Builder.CreateCall(ReadTIDIGX, {});
  Value *TIdY = Builder.CreateCall(ReadTIDIGY, {});
  Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {});

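  // Flatten the three-dimensional work item id into a single linear index:
  //   TID = tidig.x * (local_size.y * local_size.z)
  //       + tidig.y * local_size.z
  //       + tidig.z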
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  std::vector<Value*> Indices;
  Indices.push_back(
      Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
  Indices.push_back(TID);

  Value *Offset = Builder.CreateGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

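  // Rewrite every collected pointer use: plain instructions simply have their
  // pointer type switched to the local address space, non-intrinsic calls are
  // retargeted to a ".local" clone of the callee, memcpy/memset are recreated
  // so their overloads match the new pointer types, and lifetime markers are
  // dropped.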
  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

      // The operand's value should be corrected on its own.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);
      continue;
    }

    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    if (!Intr) {
      // Non-intrinsic call: redirect it to a ".local" clone of the callee
      // declared with the rewritten argument types.
      std::vector<Type*> ArgTypes;
      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
                                ArgIdx != ArgEnd; ++ArgIdx) {
        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
      }
      Function *F = Call->getCalledFunction();
      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
                                                F->isVarArg());
      Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
                                             NewType, F->getAttributes());
      Function *NewF = cast<Function>(C);
      Call->setCalledFunction(NewF);
      continue;
    }

    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only.
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      // Recreate the memcpy so the intrinsic overload matches the new
      // address space of the pointer operands.
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      // Likewise, recreate the memset with the rewritten destination pointer.
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->dump();
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}

FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
  return new AMDGPUPromoteAlloca(ST);
}