1//===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//==-----------------------------------------------------------------------===//
9
10#include "AMDILDevices.h"
11#include "AMDGPUInstrInfo.h"
12#include "llvm/ADT/Statistic.h"
13#include "llvm/ADT/StringExtras.h"
14#include "llvm/ADT/StringRef.h"
15#include "llvm/ADT/Twine.h"
16#include "llvm/Constants.h"
17#include "llvm/CodeGen/MachineFunction.h"
18#include "llvm/CodeGen/MachineFunctionAnalysis.h"
19#include "llvm/Function.h"
20#include "llvm/Instructions.h"
21#include "llvm/Module.h"
22#include "llvm/Support/Debug.h"
23#include "llvm/Support/MathExtras.h"
24
25#include <sstream>
26
27#if 0
28STATISTIC(PointerAssignments, "Number of dynamic pointer "
29    "assigments discovered");
30STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
31#endif
32
33using namespace llvm;
34// The Peephole optimization pass is used to do simple last minute optimizations
35// that are required for correct code or to remove redundant functions
36namespace {
37
38class OpaqueType;
39
// Last-minute IR-level peephole pass for the AMDIL/AMDGPU backends.  It
// rewrites target builtin calls (atomics, bit-field intrinsics, 24-bit
// math, accurate divide, samplers) and recognizes shift/mask sequences
// that map onto single bit-extract / bit-insert ISA instructions.
class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
public:
  // Target machine this pass was created for; supplies the subtarget.
  TargetMachine &TM;
  // Pass identification; the address of this member is the unique ID.
  static char ID;
  AMDGPUPeepholeOpt(TargetMachine &tm);
  ~AMDGPUPeepholeOpt();
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
protected:
private:
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If this is correct, then we set all pointers to exist in arena. This
  // is a workaround for aliasing of pointers in a struct/union.
  bool dumpAllIntoArena(Function &F);
  // Because I don't want to invalidate any pointers while in the
  // safeNestedForEachFunction. I push atomic conversions to a vector and handle
  // it later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the calls are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  // True once any rewrite happened; returned by runOnFunction.
  bool mChanged;
  // Enables verbose dbgs() tracing of the pattern matching.
  bool mDebug;
  // Cleared when an atomic on global memory is seen (see optimizeCallInst).
  bool mConvertAtomics;
  // Optimization level the TargetMachine was configured with.
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base,
                      Instruction *&src,
                      Constant *&mask,
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support it.
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on version of OpenCL that
  // don't support it.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
  // this case we need to expand them. These functions check for 24bit functions
  // and then expand.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified then the result of get_local_size is known at compile time and
  // can be returned accordingly.
  bool isRWGLocalOpt(CallInst *CI);
  // On northern island cards, the division is slightly less accurate than on
  // previous generations, so we need to utilize a more accurate division. So we
  // can translate the accurate divide to a normal divide on all other cards.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
  bool correctMisalignedMemOp(Instruction *inst);

  // If we are in no opt mode, then we need to make sure that
  // local samplers are properly propagated as constant propagation
  // doesn't occur and we need to know the value of kernel defined
  // samplers at compile time.
  bool propagateSamplerInst(CallInst *CI);

  // Helper functions

  // Group of functions that recursively calculate the size of a structure based
  // on its sub-types.
  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
  size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);

  // Per-function state cached by runOnFunction for the duration of a run.
  LLVMContext *mCTX;
  Function *mF;
  const AMDGPUSubtarget *mSTM;
  // Calls whose callee operand must be redirected to the "_noret" variant.
  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
  // Deferred __amdil_is_constant calls (see doIsConstCallConversionIfNeeded).
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
  char AMDGPUPeepholeOpt::ID = 0;
133
// Applies the callback F to every element of every inner range of
// [First, Last), passing a pointer to the inner iterator so the callback
// may erase the current element.  When F returns false the loop advances
// the iterator itself; when F returns true the callback is assumed to
// have already repositioned it (e.g. past an erased instruction).  The
// (possibly stateful) callback is returned once iteration finishes.
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                              SecondIterator S, Function F)
{
  while (First != Last) {
    SecondIterator cur = First->begin();
    SecondIterator stop = First->end();
    while (cur != stop) {
      bool handled = F(&cur);
      if (!handled) {
        ++cur;
      }
    }
    ++First;
  }
  return F;
}
151
152} // anonymous namespace
153
namespace llvm {
  // Factory used by the AMDGPU target to add this peephole pass to the
  // codegen pipeline.  The caller takes ownership of the returned pass.
  FunctionPass *
  createAMDGPUPeepholeOpt(TargetMachine &tm)
  {
    return new AMDGPUPeepholeOpt(tm);
  }
} // llvm namespace
161
162AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
163  : FunctionPass(ID), TM(tm)
164{
165  mDebug = false;
166  optLevel = TM.getOptLevel();
167
168}
169
// Destructor: the pass owns no resources beyond its value members, so
// there is nothing to release.
AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()
{
}
173
174const char *
175AMDGPUPeepholeOpt::getPassName() const
176{
177  return "AMDGPU PeepHole Optimization Pass";
178}
179
180bool
181containsPointerType(Type *Ty)
182{
183  if (!Ty) {
184    return false;
185  }
186  switch(Ty->getTypeID()) {
187  default:
188    return false;
189  case Type::StructTyID: {
190    const StructType *ST = dyn_cast<StructType>(Ty);
191    for (StructType::element_iterator stb = ST->element_begin(),
192           ste = ST->element_end(); stb != ste; ++stb) {
193      if (!containsPointerType(*stb)) {
194        continue;
195      }
196      return true;
197    }
198    break;
199  }
200  case Type::VectorTyID:
201  case Type::ArrayTyID:
202    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
203  case Type::PointerTyID:
204    return true;
205  };
206  return false;
207}
208
209bool
210AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)
211{
212  bool dumpAll = false;
213  for (Function::const_arg_iterator cab = F.arg_begin(),
214       cae = F.arg_end(); cab != cae; ++cab) {
215    const Argument *arg = cab;
216    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
217    if (!PT) {
218      continue;
219    }
220    Type *DereferencedType = PT->getElementType();
221    if (!dyn_cast<StructType>(DereferencedType)
222        ) {
223      continue;
224    }
225    if (!containsPointerType(DereferencedType)) {
226      continue;
227    }
228    // FIXME: Because a pointer inside of a struct/union may be aliased to
229    // another pointer we need to take the conservative approach and place all
230    // pointers into the arena until more advanced detection is implemented.
231    dumpAll = true;
232  }
233  return dumpAll;
234}
235void
236AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded()
237{
238  if (isConstVec.empty()) {
239    return;
240  }
241  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
242    CallInst *CI = isConstVec[x];
243    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
244    Type *aType = Type::getInt32Ty(*mCTX);
245    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
246      : ConstantInt::get(aType, 0);
247    CI->replaceAllUsesWith(Val);
248    CI->eraseFromParent();
249  }
250  isConstVec.clear();
251}
252void
253AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
254{
255  // Don't do anything if we don't have any atomic operations.
256  if (atomicFuncs.empty()) {
257    return;
258  }
259  // Change the function name for the atomic if it is required
260  uint32_t size = atomicFuncs.size();
261  for (uint32_t x = 0; x < size; ++x) {
262    atomicFuncs[x].first->setOperand(
263        atomicFuncs[x].first->getNumOperands()-1,
264        atomicFuncs[x].second);
265
266  }
267  mChanged = true;
268  if (mConvertAtomics) {
269    return;
270  }
271}
272
// Entry point of the pass.  Despite the parameter name, MF is an LLVM IR
// Function, not a MachineFunction.  Walks every instruction, applying
// the instruction-level peepholes, then performs the deferred atomic and
// __amdil_is_constant rewrites.  Returns true if the IR was modified.
bool
AMDGPUPeepholeOpt::runOnFunction(Function &MF)
{
  mChanged = false;
  mF = &MF;
  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
  if (mDebug) {
    MF.dump();
  }
  // Cache the context for constant creation in the helpers below.
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
  // Visit every instruction in every basic block.  The bound callback
  // returns true when it already advanced (and erased behind) the
  // iterator, so safeNestedForEach only increments on a false return.
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                  this));

  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();

  if (mDebug) {
    MF.dump();
  }
  return mChanged;
}
296
297bool
298AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
299{
300  Instruction *inst = (*bbb);
301  CallInst *CI = dyn_cast<CallInst>(inst);
302  if (!CI) {
303    return false;
304  }
305  if (isSigned24BitOps(CI)) {
306    expandSigned24BitOps(CI);
307    ++(*bbb);
308    CI->eraseFromParent();
309    return true;
310  }
311  if (propagateSamplerInst(CI)) {
312    return false;
313  }
314  if (expandBFI(CI) || expandBFM(CI)) {
315    ++(*bbb);
316    CI->eraseFromParent();
317    return true;
318  }
319  if (convertAccurateDivide(CI)) {
320    expandAccurateDivide(CI);
321    ++(*bbb);
322    CI->eraseFromParent();
323    return true;
324  }
325
326  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
327  if (calleeName.startswith("__amdil_is_constant")) {
328    // If we do not have optimizations, then this
329    // cannot be properly evaluated, so we add the
330    // call instruction to a vector and process
331    // them at the end of processing after the
332    // samplers have been correctly handled.
333    if (optLevel == CodeGenOpt::None) {
334      isConstVec.push_back(CI);
335      return false;
336    } else {
337      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
338      Type *aType = Type::getInt32Ty(*mCTX);
339      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
340        : ConstantInt::get(aType, 0);
341      CI->replaceAllUsesWith(Val);
342      ++(*bbb);
343      CI->eraseFromParent();
344      return true;
345    }
346  }
347
348  if (calleeName.equals("__amdil_is_asic_id_i32")) {
349    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
350    Type *aType = Type::getInt32Ty(*mCTX);
351    Value *Val = CV;
352    if (Val) {
353      Val = ConstantInt::get(aType,
354          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
355    } else {
356      Val = ConstantInt::get(aType, 0);
357    }
358    CI->replaceAllUsesWith(Val);
359    ++(*bbb);
360    CI->eraseFromParent();
361    return true;
362  }
363  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
364  if (!F) {
365    return false;
366  }
367  if (F->getName().startswith("__atom") && !CI->getNumUses()
368      && F->getName().find("_xchg") == StringRef::npos) {
369    std::string buffer(F->getName().str() + "_noret");
370    F = dyn_cast<Function>(
371          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
372    atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
373  }
374
375  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
376      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
377    return false;
378  }
379  if (!mConvertAtomics) {
380    return false;
381  }
382  StringRef name = F->getName();
383  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
384    mConvertAtomics = false;
385  }
386  return false;
387}
388
389bool
390AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
391    Instruction *&src,
392    Constant *&mask,
393    Constant *&shift)
394{
395  if (!base) {
396    if (mDebug) {
397      dbgs() << "Null pointer passed into function.\n";
398    }
399    return false;
400  }
401  bool andOp = false;
402  if (base->getOpcode() == Instruction::Shl) {
403    shift = dyn_cast<Constant>(base->getOperand(1));
404  } else if (base->getOpcode() == Instruction::And) {
405    mask = dyn_cast<Constant>(base->getOperand(1));
406    andOp = true;
407  } else {
408    if (mDebug) {
409      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
410    }
411    // If the base is neither a Shl or a And, we don't fit any of the patterns above.
412    return false;
413  }
414  src = dyn_cast<Instruction>(base->getOperand(0));
415  if (!src) {
416    if (mDebug) {
417      dbgs() << "Failed setup since the base operand is not an instruction!\n";
418    }
419    return false;
420  }
421  // If we find an 'and' operation, then we don't need to
422  // find the next operation as we already know the
423  // bits that are valid at this point.
424  if (andOp) {
425    return true;
426  }
427  if (src->getOpcode() == Instruction::Shl && !shift) {
428    shift = dyn_cast<Constant>(src->getOperand(1));
429    src = dyn_cast<Instruction>(src->getOperand(0));
430  } else if (src->getOpcode() == Instruction::And && !mask) {
431    mask = dyn_cast<Constant>(src->getOperand(1));
432  }
433  if (!mask && !shift) {
434    if (mDebug) {
435      dbgs() << "Failed setup since both mask and shift are NULL!\n";
436    }
437    // Did not find a constant mask or a shift.
438    return false;
439  }
440  return true;
441}
442bool
443AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)
444{
445  if (!inst) {
446    return false;
447  }
448  if (!inst->isBinaryOp()) {
449    return false;
450  }
451  if (inst->getOpcode() != Instruction::Or) {
452    return false;
453  }
454  if (optLevel == CodeGenOpt::None) {
455    return false;
456  }
457  // We want to do an optimization on a sequence of ops that in the end equals a
458  // single ISA instruction.
459  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
460  // Some simplified versions of this pattern are as follows:
461  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
462  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
463  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
464  // (A & B) | (D << F) when (1 << F) >= B
465  // (A << C) | (D & E) when (1 << C) >= E
466  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
467    // The HD4XXX hardware doesn't support the ubit_insert instruction.
468    return false;
469  }
470  Type *aType = inst->getType();
471  bool isVector = aType->isVectorTy();
472  int numEle = 1;
473  // This optimization only works on 32bit integers.
474  if (aType->getScalarType()
475      != Type::getInt32Ty(inst->getContext())) {
476    return false;
477  }
478  if (isVector) {
479    const VectorType *VT = dyn_cast<VectorType>(aType);
480    numEle = VT->getNumElements();
481    // We currently cannot support more than 4 elements in a intrinsic and we
482    // cannot support Vec3 types.
483    if (numEle > 4 || numEle == 3) {
484      return false;
485    }
486  }
487  // TODO: Handle vectors.
488  if (isVector) {
489    if (mDebug) {
490      dbgs() << "!!! Vectors are not supported yet!\n";
491    }
492    return false;
493  }
494  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
495  Constant *LHSMask = NULL, *RHSMask = NULL;
496  Constant *LHSShift = NULL, *RHSShift = NULL;
497  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
498  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
499  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
500    if (mDebug) {
501      dbgs() << "Found an OR Operation that failed setup!\n";
502      inst->dump();
503      if (LHS) { LHS->dump(); }
504      if (LHSSrc) { LHSSrc->dump(); }
505      if (LHSMask) { LHSMask->dump(); }
506      if (LHSShift) { LHSShift->dump(); }
507    }
508    // There was an issue with the setup for BitInsert.
509    return false;
510  }
511  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
512    if (mDebug) {
513      dbgs() << "Found an OR Operation that failed setup!\n";
514      inst->dump();
515      if (RHS) { RHS->dump(); }
516      if (RHSSrc) { RHSSrc->dump(); }
517      if (RHSMask) { RHSMask->dump(); }
518      if (RHSShift) { RHSShift->dump(); }
519    }
520    // There was an issue with the setup for BitInsert.
521    return false;
522  }
523  if (mDebug) {
524    dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
525    dbgs() << "Op:        "; inst->dump();
526    dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
527    dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
528    dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
529    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
530    dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
531    dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
532    dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
533    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
534  }
535  Constant *offset = NULL;
536  Constant *width = NULL;
537  int32_t lhsMaskVal = 0, rhsMaskVal = 0;
538  int32_t lhsShiftVal = 0, rhsShiftVal = 0;
539  int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
540  int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
541  lhsMaskVal = (int32_t)(LHSMask
542      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
543  rhsMaskVal = (int32_t)(RHSMask
544      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
545  lhsShiftVal = (int32_t)(LHSShift
546      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
547  rhsShiftVal = (int32_t)(RHSShift
548      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
549  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
550  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
551  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
552  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
553  // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
554  if (mDebug) {
555      dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
556      dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
557      dbgs() << (RHSMask ? " & E)" : ")");
558      dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
559      dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
560      dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
561      dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
562      dbgs() << "width(B) = " << lhsMaskWidth;
563      dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
564      dbgs() << "offset(B) = " << lhsMaskOffset;
565      dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
566      dbgs() << "Constraints: \n";
567      dbgs() << "\t(1) B ^ E == 0\n";
568      dbgs() << "\t(2-LHS) B is a mask\n";
569      dbgs() << "\t(2-LHS) E is a mask\n";
570      dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
571      dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
572  }
573  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
574    if (mDebug) {
575      dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
576      dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
577      dbgs() << "Failed constraint 1!\n";
578    }
579    return false;
580  }
581  if (mDebug) {
582    dbgs() << "LHS = " << lhsMaskOffset << "";
583    dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
584    dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
585    dbgs() << "\nRHS = " << rhsMaskOffset << "";
586    dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
587    dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
588    dbgs() << "\n";
589  }
590  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
591    offset = ConstantInt::get(aType, lhsMaskOffset, false);
592    width = ConstantInt::get(aType, lhsMaskWidth, false);
593    RHSSrc = RHS;
594    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
595      if (mDebug) {
596        dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
597        dbgs() << "Failed constraint 2!\n";
598      }
599      return false;
600    }
601    if (!LHSShift) {
602      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
603          "MaskShr", LHS);
604    } else if (lhsShiftVal != lhsMaskOffset) {
605      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
606          "MaskShr", LHS);
607    }
608    if (mDebug) {
609      dbgs() << "Optimizing LHS!\n";
610    }
611  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
612    offset = ConstantInt::get(aType, rhsMaskOffset, false);
613    width = ConstantInt::get(aType, rhsMaskWidth, false);
614    LHSSrc = RHSSrc;
615    RHSSrc = LHS;
616    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
617      if (mDebug) {
618        dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
619        dbgs() << "Failed constraint 2!\n";
620      }
621      return false;
622    }
623    if (!RHSShift) {
624      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
625          "MaskShr", RHS);
626    } else if (rhsShiftVal != rhsMaskOffset) {
627      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
628          "MaskShr", RHS);
629    }
630    if (mDebug) {
631      dbgs() << "Optimizing RHS!\n";
632    }
633  } else {
634    if (mDebug) {
635      dbgs() << "Failed constraint 3!\n";
636    }
637    return false;
638  }
639  if (mDebug) {
640    dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
641    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
642    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
643    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
644  }
645  if (!offset || !width) {
646    if (mDebug) {
647      dbgs() << "Either width or offset are NULL, failed detection!\n";
648    }
649    return false;
650  }
651  // Lets create the function signature.
652  std::vector<Type *> callTypes;
653  callTypes.push_back(aType);
654  callTypes.push_back(aType);
655  callTypes.push_back(aType);
656  callTypes.push_back(aType);
657  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
658  std::string name = "__amdil_ubit_insert";
659  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
660  Function *Func =
661    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
662        getOrInsertFunction(llvm::StringRef(name), funcType));
663  Value *Operands[4] = {
664    width,
665    offset,
666    LHSSrc,
667    RHSSrc
668  };
669  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
670  if (mDebug) {
671    dbgs() << "Old Inst: ";
672    inst->dump();
673    dbgs() << "New Inst: ";
674    CI->dump();
675    dbgs() << "\n\n";
676  }
677  CI->insertBefore(inst);
678  inst->replaceAllUsesWith(CI);
679  return true;
680}
681
682bool
683AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)
684{
685  if (!inst) {
686    return false;
687  }
688  if (!inst->isBinaryOp()) {
689    return false;
690  }
691  if (inst->getOpcode() != Instruction::And) {
692    return false;
693  }
694  if (optLevel == CodeGenOpt::None) {
695    return false;
696  }
697  // We want to do some simple optimizations on Shift right/And patterns. The
698  // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
699  // value smaller than 32 and C is a mask. If C is a constant value, then the
700  // following transformation can occur. For signed integers, it turns into the
701  // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
702  // integers, it turns into the function call dst =
703  // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
704  // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
705  // Evergreen hardware.
706  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
707    // This does not work on HD4XXX hardware.
708    return false;
709  }
710  Type *aType = inst->getType();
711  bool isVector = aType->isVectorTy();
712
713  // XXX Support vector types
714  if (isVector) {
715    return false;
716  }
717  int numEle = 1;
718  // This only works on 32bit integers
719  if (aType->getScalarType()
720      != Type::getInt32Ty(inst->getContext())) {
721    return false;
722  }
723  if (isVector) {
724    const VectorType *VT = dyn_cast<VectorType>(aType);
725    numEle = VT->getNumElements();
726    // We currently cannot support more than 4 elements in a intrinsic and we
727    // cannot support Vec3 types.
728    if (numEle > 4 || numEle == 3) {
729      return false;
730    }
731  }
732  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
733  // If the first operand is not a shift instruction, then we can return as it
734  // doesn't match this pattern.
735  if (!ShiftInst || !ShiftInst->isShift()) {
736    return false;
737  }
738  // If we are a shift left, then we need don't match this pattern.
739  if (ShiftInst->getOpcode() == Instruction::Shl) {
740    return false;
741  }
742  bool isSigned = ShiftInst->isArithmeticShift();
743  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
744  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
745  // Lets make sure that the shift value and the and mask are constant integers.
746  if (!AndMask || !ShrVal) {
747    return false;
748  }
749  Constant *newMaskConst;
750  Constant *shiftValConst;
751  if (isVector) {
752    // Handle the vector case
753    std::vector<Constant *> maskVals;
754    std::vector<Constant *> shiftVals;
755    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
756    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
757    Type *scalarType = AndMaskVec->getType()->getScalarType();
758    assert(AndMaskVec->getNumOperands() ==
759           ShrValVec->getNumOperands() && "cannot have a "
760           "combination where the number of elements to a "
761           "shift and an and are different!");
762    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
763      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
764      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
765      if (!AndCI || !ShiftIC) {
766        return false;
767      }
768      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
769      if (!isMask_32(maskVal)) {
770        return false;
771      }
772      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
773      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
774      // If the mask or shiftval is greater than the bitcount, then break out.
775      if (maskVal >= 32 || shiftVal >= 32) {
776        return false;
777      }
778      // If the mask val is greater than the the number of original bits left
779      // then this optimization is invalid.
780      if (maskVal > (32 - shiftVal)) {
781        return false;
782      }
783      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
784      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
785    }
786    newMaskConst = ConstantVector::get(maskVals);
787    shiftValConst = ConstantVector::get(shiftVals);
788  } else {
789    // Handle the scalar case
790    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
791    // This must be a mask value where all lower bits are set to 1 and then any
792    // bit higher is set to 0.
793    if (!isMask_32(maskVal)) {
794      return false;
795    }
796    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
797    // Count the number of bits set in the mask, this is the width of the
798    // resulting bit set that is extracted from the source value.
799    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
800    // If the mask or shift val is greater than the bitcount, then break out.
801    if (maskVal >= 32 || shiftVal >= 32) {
802      return false;
803    }
804    // If the mask val is greater than the the number of original bits left then
805    // this optimization is invalid.
806    if (maskVal > (32 - shiftVal)) {
807      return false;
808    }
809    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
810    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
811  }
812  // Lets create the function signature.
813  std::vector<Type *> callTypes;
814  callTypes.push_back(aType);
815  callTypes.push_back(aType);
816  callTypes.push_back(aType);
817  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
818  std::string name = "llvm.AMDIL.bit.extract.u32";
819  if (isVector) {
820    name += ".v" + itostr(numEle) + "i32";
821  } else {
822    name += ".";
823  }
824  // Lets create the function.
825  Function *Func =
826    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
827                       getOrInsertFunction(llvm::StringRef(name), funcType));
828  Value *Operands[3] = {
829    ShiftInst->getOperand(0),
830    shiftValConst,
831    newMaskConst
832  };
833  // Lets create the Call with the operands
834  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
835  CI->setDoesNotAccessMemory();
836  CI->insertBefore(inst);
837  inst->replaceAllUsesWith(CI);
838  return true;
839}
840
841bool
842AMDGPUPeepholeOpt::expandBFI(CallInst *CI)
843{
844  if (!CI) {
845    return false;
846  }
847  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
848  if (!LHS->getName().startswith("__amdil_bfi")) {
849    return false;
850  }
851  Type* type = CI->getOperand(0)->getType();
852  Constant *negOneConst = NULL;
853  if (type->isVectorTy()) {
854    std::vector<Constant *> negOneVals;
855    negOneConst = ConstantInt::get(CI->getContext(),
856        APInt(32, StringRef("-1"), 10));
857    for (size_t x = 0,
858        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
859      negOneVals.push_back(negOneConst);
860    }
861    negOneConst = ConstantVector::get(negOneVals);
862  } else {
863    negOneConst = ConstantInt::get(CI->getContext(),
864        APInt(32, StringRef("-1"), 10));
865  }
866  // __amdil_bfi => (A & B) | (~A & C)
867  BinaryOperator *lhs =
868    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
869        CI->getOperand(1), "bfi_and", CI);
870  BinaryOperator *rhs =
871    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
872        "bfi_not", CI);
873  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
874      "bfi_and", CI);
875  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
876  CI->replaceAllUsesWith(lhs);
877  return true;
878}
879
880bool
881AMDGPUPeepholeOpt::expandBFM(CallInst *CI)
882{
883  if (!CI) {
884    return false;
885  }
886  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
887  if (!LHS->getName().startswith("__amdil_bfm")) {
888    return false;
889  }
890  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
891  Constant *newMaskConst = NULL;
892  Constant *newShiftConst = NULL;
893  Type* type = CI->getOperand(0)->getType();
894  if (type->isVectorTy()) {
895    std::vector<Constant*> newMaskVals, newShiftVals;
896    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
897    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
898    for (size_t x = 0,
899        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
900      newMaskVals.push_back(newMaskConst);
901      newShiftVals.push_back(newShiftConst);
902    }
903    newMaskConst = ConstantVector::get(newMaskVals);
904    newShiftConst = ConstantVector::get(newShiftVals);
905  } else {
906    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
907    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
908  }
909  BinaryOperator *lhs =
910    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
911        newMaskConst, "bfm_mask", CI);
912  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
913      lhs, "bfm_shl", CI);
914  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
915      newShiftConst, "bfm_sub", CI);
916  BinaryOperator *rhs =
917    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
918        newMaskConst, "bfm_mask", CI);
919  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
920  CI->replaceAllUsesWith(lhs);
921  return true;
922}
923
924bool
925AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
926{
927  Instruction *inst = (*bbb);
928  if (optimizeCallInst(bbb)) {
929    return true;
930  }
931  if (optimizeBitExtract(inst)) {
932    return false;
933  }
934  if (optimizeBitInsert(inst)) {
935    return false;
936  }
937  if (correctMisalignedMemOp(inst)) {
938    return false;
939  }
940  return false;
941}
942bool
943AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
944{
945  LoadInst *linst = dyn_cast<LoadInst>(inst);
946  StoreInst *sinst = dyn_cast<StoreInst>(inst);
947  unsigned alignment;
948  Type* Ty = inst->getType();
949  if (linst) {
950    alignment = linst->getAlignment();
951    Ty = inst->getType();
952  } else if (sinst) {
953    alignment = sinst->getAlignment();
954    Ty = sinst->getValueOperand()->getType();
955  } else {
956    return false;
957  }
958  unsigned size = getTypeSize(Ty);
959  if (size == alignment || size < alignment) {
960    return false;
961  }
962  if (!Ty->isStructTy()) {
963    return false;
964  }
965  if (alignment < 4) {
966    if (linst) {
967      linst->setAlignment(0);
968      return true;
969    } else if (sinst) {
970      sinst->setAlignment(0);
971      return true;
972    }
973  }
974  return false;
975}
976bool
977AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)
978{
979  if (!CI) {
980    return false;
981  }
982  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
983  std::string namePrefix = LHS->getName().substr(0, 14);
984  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
985      && namePrefix != "__amdil__imul24_high") {
986    return false;
987  }
988  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
989    return false;
990  }
991  return true;
992}
993
994void
995AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)
996{
997  assert(isSigned24BitOps(CI) && "Must be a "
998      "signed 24 bit operation to call this function!");
999  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
1000  // On 7XX and 8XX we do not have signed 24bit, so we need to
1001  // expand it to the following:
1002  // imul24 turns into 32bit imul
1003  // imad24 turns into 32bit imad
1004  // imul24_high turns into 32bit imulhigh
1005  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
1006    Type *aType = CI->getOperand(0)->getType();
1007    bool isVector = aType->isVectorTy();
1008    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
1009    std::vector<Type*> callTypes;
1010    callTypes.push_back(CI->getOperand(0)->getType());
1011    callTypes.push_back(CI->getOperand(1)->getType());
1012    callTypes.push_back(CI->getOperand(2)->getType());
1013    FunctionType *funcType =
1014      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
1015    std::string name = "__amdil_imad";
1016    if (isVector) {
1017      name += "_v" + itostr(numEle) + "i32";
1018    } else {
1019      name += "_i32";
1020    }
1021    Function *Func = dyn_cast<Function>(
1022                       CI->getParent()->getParent()->getParent()->
1023                       getOrInsertFunction(llvm::StringRef(name), funcType));
1024    Value *Operands[3] = {
1025      CI->getOperand(0),
1026      CI->getOperand(1),
1027      CI->getOperand(2)
1028    };
1029    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
1030    nCI->insertBefore(CI);
1031    CI->replaceAllUsesWith(nCI);
1032  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
1033    BinaryOperator *mulOp =
1034      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
1035          CI->getOperand(1), "imul24", CI);
1036    CI->replaceAllUsesWith(mulOp);
1037  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
1038    Type *aType = CI->getOperand(0)->getType();
1039
1040    bool isVector = aType->isVectorTy();
1041    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
1042    std::vector<Type*> callTypes;
1043    callTypes.push_back(CI->getOperand(0)->getType());
1044    callTypes.push_back(CI->getOperand(1)->getType());
1045    FunctionType *funcType =
1046      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
1047    std::string name = "__amdil_imul_high";
1048    if (isVector) {
1049      name += "_v" + itostr(numEle) + "i32";
1050    } else {
1051      name += "_i32";
1052    }
1053    Function *Func = dyn_cast<Function>(
1054                       CI->getParent()->getParent()->getParent()->
1055                       getOrInsertFunction(llvm::StringRef(name), funcType));
1056    Value *Operands[2] = {
1057      CI->getOperand(0),
1058      CI->getOperand(1)
1059    };
1060    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
1061    nCI->insertBefore(CI);
1062    CI->replaceAllUsesWith(nCI);
1063  }
1064}
1065
1066bool
1067AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)
1068{
1069  return (CI != NULL
1070          && CI->getOperand(CI->getNumOperands() - 1)->getName()
1071          == "__amdil_get_local_size_int");
1072}
1073
1074bool
1075AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)
1076{
1077  if (!CI) {
1078    return false;
1079  }
1080  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
1081      && (mSTM->getDeviceName() == "cayman")) {
1082    return false;
1083  }
1084  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
1085      == "__amdil_improved_div";
1086}
1087
1088void
1089AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)
1090{
1091  assert(convertAccurateDivide(CI)
1092         && "expanding accurate divide can only happen if it is expandable!");
1093  BinaryOperator *divOp =
1094    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
1095                           CI->getOperand(1), "fdiv32", CI);
1096  CI->replaceAllUsesWith(divOp);
1097}
1098
1099bool
1100AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI)
1101{
1102  if (optLevel != CodeGenOpt::None) {
1103    return false;
1104  }
1105
1106  if (!CI) {
1107    return false;
1108  }
1109
1110  unsigned funcNameIdx = 0;
1111  funcNameIdx = CI->getNumOperands() - 1;
1112  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
1113  if (calleeName != "__amdil_image2d_read_norm"
1114   && calleeName != "__amdil_image2d_read_unnorm"
1115   && calleeName != "__amdil_image3d_read_norm"
1116   && calleeName != "__amdil_image3d_read_unnorm") {
1117    return false;
1118  }
1119
1120  unsigned samplerIdx = 2;
1121  samplerIdx = 1;
1122  Value *sampler = CI->getOperand(samplerIdx);
1123  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
1124  if (!lInst) {
1125    return false;
1126  }
1127
1128  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1129    return false;
1130  }
1131
1132  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
1133  // If we are loading from what is not a global value, then we
1134  // fail and return.
1135  if (!gv) {
1136    return false;
1137  }
1138
1139  // If we don't have an initializer or we have an initializer and
1140  // the initializer is not a 32bit integer, we fail.
1141  if (!gv->hasInitializer()
1142      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
1143      return false;
1144  }
1145
1146  // Now that we have the global variable initializer, lets replace
1147  // all uses of the load instruction with the samplerVal and
1148  // reparse the __amdil_is_constant() function.
1149  Constant *samplerVal = gv->getInitializer();
1150  lInst->replaceAllUsesWith(samplerVal);
1151  return true;
1152}
1153
bool
AMDGPUPeepholeOpt::doInitialization(Module &M)
{
  // No per-module setup is required; report the module as unmodified.
  return false;
}
1159
bool
AMDGPUPeepholeOpt::doFinalization(Module &M)
{
  // No per-module teardown is required; report the module as unmodified.
  return false;
}
1165
void
AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
{
  // Require MachineFunctionAnalysis so target/subtarget information is
  // available when the pass runs.
  AU.addRequired<MachineFunctionAnalysis>();
  FunctionPass::getAnalysisUsage(AU);
  // NOTE(review): setPreservesAll() is suspicious for a pass whose
  // expand* helpers rewrite IR — confirm downstream analyses tolerate
  // this before relying on it.
  AU.setPreservesAll();
}
1173
1174size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
1175  size_t size = 0;
1176  if (!T) {
1177    return size;
1178  }
1179  switch (T->getTypeID()) {
1180  case Type::X86_FP80TyID:
1181  case Type::FP128TyID:
1182  case Type::PPC_FP128TyID:
1183  case Type::LabelTyID:
1184    assert(0 && "These types are not supported by this backend");
1185  default:
1186  case Type::FloatTyID:
1187  case Type::DoubleTyID:
1188    size = T->getPrimitiveSizeInBits() >> 3;
1189    break;
1190  case Type::PointerTyID:
1191    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
1192    break;
1193  case Type::IntegerTyID:
1194    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
1195    break;
1196  case Type::StructTyID:
1197    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
1198    break;
1199  case Type::ArrayTyID:
1200    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
1201    break;
1202  case Type::FunctionTyID:
1203    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
1204    break;
1205  case Type::VectorTyID:
1206    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
1207    break;
1208  };
1209  return size;
1210}
1211
1212size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
1213    bool dereferencePtr) {
1214  size_t size = 0;
1215  if (!ST) {
1216    return size;
1217  }
1218  Type *curType;
1219  StructType::element_iterator eib;
1220  StructType::element_iterator eie;
1221  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
1222    curType = *eib;
1223    size += getTypeSize(curType, dereferencePtr);
1224  }
1225  return size;
1226}
1227
1228size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
1229    bool dereferencePtr) {
1230  return IT ? (IT->getBitWidth() >> 3) : 0;
1231}
1232
1233size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
1234    bool dereferencePtr) {
1235    assert(0 && "Should not be able to calculate the size of an function type");
1236    return 0;
1237}
1238
1239size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
1240    bool dereferencePtr) {
1241  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
1242                                    dereferencePtr) * AT->getNumElements())
1243                     : 0);
1244}
1245
1246size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
1247    bool dereferencePtr) {
1248  return VT ? (VT->getBitWidth() >> 3) : 0;
1249}
1250
1251size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
1252    bool dereferencePtr) {
1253  if (!PT) {
1254    return 0;
1255  }
1256  Type *CT = PT->getElementType();
1257  if (CT->getTypeID() == Type::StructTyID &&
1258      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1259    return getTypeSize(dyn_cast<StructType>(CT));
1260  } else if (dereferencePtr) {
1261    size_t size = 0;
1262    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
1263      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
1264    }
1265    return size;
1266  } else {
1267    return 4;
1268  }
1269}
1270
size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
    bool dereferencePtr) {
  //assert(0 && "Should not be able to calculate the size of an opaque type");
  // Opaque types are reported as 4 bytes (pointer-sized on this target).
  return 4;
}
1276