AMDILPeepholeOptimizer.cpp revision a75c6163e605f35b14f26930dd9227e4f337ec9e
1//===-- AMDILPeepholeOptimizer.cpp - AMDIL peephole optimizations ------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//==-----------------------------------------------------------------------===//
9
10#define DEBUG_TYPE "PeepholeOpt"
11#ifdef DEBUG
12#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
13#else
14#define DEBUGME 0
15#endif
16
17#include "AMDILAlgorithms.tpp"
18#include "AMDILDevices.h"
19#include "AMDILGlobalManager.h"
20#include "AMDILKernelManager.h"
21#include "AMDILMachineFunctionInfo.h"
22#include "AMDILUtilityFunctions.h"
23#include "llvm/ADT/Statistic.h"
24#include "llvm/ADT/StringExtras.h"
25#include "llvm/ADT/StringRef.h"
26#include "llvm/ADT/Twine.h"
27#include "llvm/CodeGen/MachineFunction.h"
28#include "llvm/CodeGen/MachineFunctionAnalysis.h"
29#include "llvm/Function.h"
30#include "llvm/Instructions.h"
31#include "llvm/Module.h"
32#include "llvm/Support/Debug.h"
33#include "llvm/Support/MathExtras.h"
34
35#include <sstream>
36
37#if 0
38STATISTIC(PointerAssignments, "Number of dynamic pointer "
39    "assigments discovered");
40STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
41#endif
42STATISTIC(LocalFuncs, "Number of get_local_size(N) functions removed");
43
44using namespace llvm;
45// The peephole optimization pass is used to do simple last-minute optimizations
46// that are required for correct code or to remove redundant functions.
47namespace {
48class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
49public:
50  TargetMachine &TM;
51  static char ID;
52  AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
53  ~AMDILPeepholeOpt();
54  const char *getPassName() const;
55  bool runOnFunction(Function &F);
56  bool doInitialization(Module &M);
57  bool doFinalization(Module &M);
58  void getAnalysisUsage(AnalysisUsage &AU) const;
59protected:
60private:
61  // Function to initiate all of the instruction level optimizations.
62  bool instLevelOptimizations(BasicBlock::iterator *inst);
63  // Quick check to see if we need to dump all of the pointers into the
64  // arena. If so, then we set all pointers to exist in the arena. This is a
65  // workaround for aliasing of pointers in a struct/union.
66  bool dumpAllIntoArena(Function &F);
67  // Because I don't want to invalidate any pointers while in the
68  // safeNestedForEach function, I push atomic conversions to a vector and
69  // handle them later. This function does the conversions if required.
70  void doAtomicConversionIfNeeded(Function &F);
71  // Because __amdil_is_constant cannot be properly evaluated if
72  // optimizations are disabled, the calls are placed in a vector
73  // and evaluated after the __amdil_image* functions are evaluated,
74  // which should allow the __amdil_is_constant function to be
75  // evaluated correctly.
76  void doIsConstCallConversionIfNeeded();
77  bool mChanged;
78  bool mDebug;
79  bool mRWGOpt;
80  bool mConvertAtomics;
81  CodeGenOpt::Level optLevel;
82  // Run a series of tests to see if we can optimize a CALL instruction.
83  bool optimizeCallInst(BasicBlock::iterator *bbb);
84  // A peephole optimization to optimize bit extract sequences.
85  bool optimizeBitExtract(Instruction *inst);
86  // A peephole optimization to optimize bit insert sequences.
87  bool optimizeBitInsert(Instruction *inst);
88  bool setupBitInsert(Instruction *base,
89                      Instruction *&src,
90                      Constant *&mask,
91                      Constant *&shift);
92  // Expand the bit field insert instruction on versions of OpenCL that
93  // don't support it.
94  bool expandBFI(CallInst *CI);
95  // Expand the bit field mask instruction on versions of OpenCL that
96  // don't support it.
97  bool expandBFM(CallInst *CI);
98  // On 7XX and 8XX hardware, we do not have 24 bit signed operations, so in
99  // this case we need to expand them. These functions check for 24 bit
100  // functions and then expand them.
101  bool isSigned24BitOps(CallInst *CI);
102  void expandSigned24BitOps(CallInst *CI);
103  // One optimization that can occur is that if the required workgroup size is
104  // specified then the result of get_local_size is known at compile time and
105  // can be returned accordingly.
106  bool isRWGLocalOpt(CallInst *CI);
107  void expandRWGLocalOpt(CallInst *CI);
108  // On Northern Islands cards, the division is slightly less accurate than on
109  // previous generations, so we need to utilize a more accurate division. On
110  // all other cards we can translate the accurate divide to a normal divide.
111  bool convertAccurateDivide(CallInst *CI);
112  void expandAccurateDivide(CallInst *CI);
113  // If the alignment is set incorrectly, it can produce really inefficient
114  // code. This checks for this scenario and fixes it if possible.
115  bool correctMisalignedMemOp(Instruction *inst);
116
117  // If we are in no-opt mode, then we need to make sure that
118  // local samplers are properly propagated, as constant propagation
119  // doesn't occur and we need to know the value of kernel-defined
120  // samplers at compile time.
121  bool propagateSamplerInst(CallInst *CI);
122
123  LLVMContext *mCTX;
124  Function *mF;
125  const AMDILSubtarget *mSTM;
126  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
127  SmallVector<CallInst *, 16> isConstVec;
128}; // class AMDILPeepholeOpt
129  char AMDILPeepholeOpt::ID = 0;
130} // anonymous namespace
131
132namespace llvm {
133  FunctionPass *
134  createAMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
135  {
136    return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR);
137  }
138} // llvm namespace
139
140AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
141  : FunctionPass(ID), TM(tm)
142{
143  mDebug = DEBUGME;
144  optLevel = TM.getOptLevel();
145
146}
147
148AMDILPeepholeOpt::~AMDILPeepholeOpt()
149{
150}
151
152const char *
153AMDILPeepholeOpt::getPassName() const
154{
155  return "AMDIL PeepHole Optimization Pass";
156}
157
158bool
159containsPointerType(Type *Ty)
160{
161  if (!Ty) {
162    return false;
163  }
164  switch(Ty->getTypeID()) {
165  default:
166    return false;
167  case Type::StructTyID: {
168    const StructType *ST = dyn_cast<StructType>(Ty);
169    for (StructType::element_iterator stb = ST->element_begin(),
170           ste = ST->element_end(); stb != ste; ++stb) {
171      if (!containsPointerType(*stb)) {
172        continue;
173      }
174      return true;
175    }
176    break;
177  }
178  case Type::VectorTyID:
179  case Type::ArrayTyID:
180    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
181  case Type::PointerTyID:
182    return true;
183  };
184  return false;
185}
186
187bool
188AMDILPeepholeOpt::dumpAllIntoArena(Function &F)
189{
190  bool dumpAll = false;
191  for (Function::const_arg_iterator cab = F.arg_begin(),
192       cae = F.arg_end(); cab != cae; ++cab) {
193    const Argument *arg = cab;
194    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
195    if (!PT) {
196      continue;
197    }
198    Type *DereferencedType = PT->getElementType();
199    if (!dyn_cast<StructType>(DereferencedType)
200        ) {
201      continue;
202    }
203    if (!containsPointerType(DereferencedType)) {
204      continue;
205    }
206    // FIXME: Because a pointer inside of a struct/union may be aliased to
207    // another pointer we need to take the conservative approach and place all
208    // pointers into the arena until more advanced detection is implemented.
209    dumpAll = true;
210  }
211  return dumpAll;
212}
213void
214AMDILPeepholeOpt::doIsConstCallConversionIfNeeded()
215{
216  if (isConstVec.empty()) {
217    return;
218  }
219  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
220    CallInst *CI = isConstVec[x];
221    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
222    Type *aType = Type::getInt32Ty(*mCTX);
223    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
224      : ConstantInt::get(aType, 0);
225    CI->replaceAllUsesWith(Val);
226    CI->eraseFromParent();
227  }
228  isConstVec.clear();
229}
230void
231AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
232{
233  // Don't do anything if we don't have any atomic operations.
234  if (atomicFuncs.empty()) {
235    return;
236  }
237  // Change the function name for the atomic if it is required
238  uint32_t size = atomicFuncs.size();
239  for (uint32_t x = 0; x < size; ++x) {
240    atomicFuncs[x].first->setOperand(
241        atomicFuncs[x].first->getNumOperands()-1,
242        atomicFuncs[x].second);
243
244  }
245  mChanged = true;
246  if (mConvertAtomics) {
247    return;
248  }
249  // If we did not convert all of the atomics, then we need to make sure that
250  // the atomics that were not converted have their base pointers set to use the
251  // arena path.
252  Function::arg_iterator argB = F.arg_begin();
253  Function::arg_iterator argE = F.arg_end();
254  AMDILKernelManager *KM = mSTM->getKernelManager();
255  AMDILMachineFunctionInfo *mMFI = getAnalysis<MachineFunctionAnalysis>().getMF()
256    .getInfo<AMDILMachineFunctionInfo>();
257  for (; argB != argE; ++argB) {
258    if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaUAV)) {
259      KM->setUAVID(argB,mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID));
260      mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID));
261    } else {
262      KM->setUAVID(argB,mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
263      mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
264    }
265  }
266}
267
268bool
269AMDILPeepholeOpt::runOnFunction(Function &MF)
270{
271  mChanged = false;
272  mF = &MF;
273  mSTM = &TM.getSubtarget<AMDILSubtarget>();
274  if (mDebug) {
275    MF.dump();
276  }
277  mCTX = &MF.getType()->getContext();
278  mConvertAtomics = true;
279  if (dumpAllIntoArena(MF)) {
280    for (Function::const_arg_iterator cab = MF.arg_begin(),
281         cae = MF.arg_end(); cab != cae; ++cab) {
282      const Argument *arg = cab;
283      AMDILKernelManager *KM = mSTM->getKernelManager();
284      KM->setUAVID(getBasePointerValue(arg),
285          mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
286    }
287  }
288  mRWGOpt = mSTM->getGlobalManager()->hasRWG(MF.getName());
289  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
290     std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations),
291                  this));
292
293  doAtomicConversionIfNeeded(MF);
294  doIsConstCallConversionIfNeeded();
295
296  if (mDebug) {
297    MF.dump();
298  }
299  return mChanged;
300}
301
302bool
303AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
304{
305  Instruction *inst = (*bbb);
306  CallInst *CI = dyn_cast<CallInst>(inst);
307  if (!CI) {
308    return false;
309  }
310  if (isSigned24BitOps(CI)) {
311    expandSigned24BitOps(CI);
312    ++(*bbb);
313    CI->eraseFromParent();
314    return true;
315  }
316  if (isRWGLocalOpt(CI)) {
317    expandRWGLocalOpt(CI);
318    return false;
319  }
320  if (propagateSamplerInst(CI)) {
321    return false;
322  }
323  if (expandBFI(CI) || expandBFM(CI)) {
324    ++(*bbb);
325    CI->eraseFromParent();
326    return true;
327  }
328  if (convertAccurateDivide(CI)) {
329    expandAccurateDivide(CI);
330    ++(*bbb);
331    CI->eraseFromParent();
332    return true;
333  }
334
335  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
336  if (calleeName.startswith("__amdil_is_constant")) {
337    // If we do not have optimizations, then this
338    // cannot be properly evaluated, so we add the
339    // call instruction to a vector and process
340    // it at the end of processing, after the
341    // samplers have been correctly handled.
342    if (optLevel == CodeGenOpt::None) {
343      isConstVec.push_back(CI);
344      return false;
345    } else {
346      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
347      Type *aType = Type::getInt32Ty(*mCTX);
348      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
349        : ConstantInt::get(aType, 0);
350      CI->replaceAllUsesWith(Val);
351      ++(*bbb);
352      CI->eraseFromParent();
353      return true;
354    }
355  }
356
357  if (calleeName.equals("__amdil_is_asic_id_i32")) {
358    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
359    Type *aType = Type::getInt32Ty(*mCTX);
360    Value *Val = CV;
361    if (Val) {
362      Val = ConstantInt::get(aType,
363          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
364    } else {
365      Val = ConstantInt::get(aType, 0);
366    }
367    CI->replaceAllUsesWith(Val);
368    ++(*bbb);
369    CI->eraseFromParent();
370    return true;
371  }
372  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
373  if (!F) {
374    return false;
375  }
376  if (F->getName().startswith("__atom") && !CI->getNumUses()
377      && F->getName().find("_xchg") == StringRef::npos) {
378    std::string buffer(F->getName().str() + "_noret");
379    F = dyn_cast<Function>(
380          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
381    atomicFuncs.push_back(std::make_pair(CI, F));
382  }
383
384  if (!mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
385      && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
386    return false;
387  }
388  if (!mConvertAtomics) {
389    return false;
390  }
391  StringRef name = F->getName();
392  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
393    Value *ptr = CI->getOperand(0);
394    const Value *basePtr = getBasePointerValue(ptr);
395    const Argument *Arg = dyn_cast<Argument>(basePtr);
396    if (Arg) {
397      AMDILGlobalManager *GM = mSTM->getGlobalManager();
398      int32_t id = GM->getArgID(Arg);
399      if (id >= 0) {
400        std::stringstream ss;
401        ss << name.str() << "_" << id << '\n';
402        std::string val;
403        ss >> val;
404        F = dyn_cast<Function>(
405              F->getParent() ->getOrInsertFunction(val, F->getFunctionType()));
406        atomicFuncs.push_back(std::make_pair(CI, F));
407      } else {
408        mConvertAtomics = false;
409      }
410    } else {
411      mConvertAtomics = false;
412    }
413  }
414  return false;
415}
416
417bool
418AMDILPeepholeOpt::setupBitInsert(Instruction *base,
419    Instruction *&src,
420    Constant *&mask,
421    Constant *&shift)
422{
423  if (!base) {
424    if (mDebug) {
425      dbgs() << "Null pointer passed into function.\n";
426    }
427    return false;
428  }
429  bool andOp = false;
430  if (base->getOpcode() == Instruction::Shl) {
431    shift = dyn_cast<Constant>(base->getOperand(1));
432  } else if (base->getOpcode() == Instruction::And) {
433    mask = dyn_cast<Constant>(base->getOperand(1));
434    andOp = true;
435  } else {
436    if (mDebug) {
437      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
438    }
439    // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
440    return false;
441  }
442  src = dyn_cast<Instruction>(base->getOperand(0));
443  if (!src) {
444    if (mDebug) {
445      dbgs() << "Failed setup since the base operand is not an instruction!\n";
446    }
447    return false;
448  }
449  // If we find an 'and' operation, then we don't need to
450  // find the next operation as we already know the
451  // bits that are valid at this point.
452  if (andOp) {
453    return true;
454  }
455  if (src->getOpcode() == Instruction::Shl && !shift) {
456    shift = dyn_cast<Constant>(src->getOperand(1));
457    src = dyn_cast<Instruction>(src->getOperand(0));
458  } else if (src->getOpcode() == Instruction::And && !mask) {
459    mask = dyn_cast<Constant>(src->getOperand(1));
460  }
461  if (!mask && !shift) {
462    if (mDebug) {
463      dbgs() << "Failed setup since both mask and shift are NULL!\n";
464    }
465    // Did not find a constant mask or a shift.
466    return false;
467  }
468  return true;
469}
470bool
471AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst)
472{
473  if (!inst) {
474    return false;
475  }
476  if (!inst->isBinaryOp()) {
477    return false;
478  }
479  if (inst->getOpcode() != Instruction::Or) {
480    return false;
481  }
482  if (optLevel == CodeGenOpt::None) {
483    return false;
484  }
485  // We want to do an optimization on a sequence of ops that in the end equals a
486  // single ISA instruction.
487  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
488  // Some simplified versions of this pattern are as follows:
489  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
490  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
491  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
492  // (A & B) | (D << F) when (1 << F) >= B
493  // (A << C) | (D & E) when (1 << C) >= E
494  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
495    // The HD4XXX hardware doesn't support the ubit_insert instruction.
496    return false;
497  }
498  Type *aType = inst->getType();
499  bool isVector = aType->isVectorTy();
500  int numEle = 1;
501  // This optimization only works on 32bit integers.
502  if (aType->getScalarType()
503      != Type::getInt32Ty(inst->getContext())) {
504    return false;
505  }
506  if (isVector) {
507    const VectorType *VT = dyn_cast<VectorType>(aType);
508    numEle = VT->getNumElements();
509    // We currently cannot support more than 4 elements in an intrinsic and we
510    // cannot support Vec3 types.
511    if (numEle > 4 || numEle == 3) {
512      return false;
513    }
514  }
515  // TODO: Handle vectors.
516  if (isVector) {
517    if (mDebug) {
518      dbgs() << "!!! Vectors are not supported yet!\n";
519    }
520    return false;
521  }
522  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
523  Constant *LHSMask = NULL, *RHSMask = NULL;
524  Constant *LHSShift = NULL, *RHSShift = NULL;
525  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
526  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
527  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
528    if (mDebug) {
529      dbgs() << "Found an OR Operation that failed setup!\n";
530      inst->dump();
531      if (LHS) { LHS->dump(); }
532      if (LHSSrc) { LHSSrc->dump(); }
533      if (LHSMask) { LHSMask->dump(); }
534      if (LHSShift) { LHSShift->dump(); }
535    }
536    // There was an issue with the setup for BitInsert.
537    return false;
538  }
539  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
540    if (mDebug) {
541      dbgs() << "Found an OR Operation that failed setup!\n";
542      inst->dump();
543      if (RHS) { RHS->dump(); }
544      if (RHSSrc) { RHSSrc->dump(); }
545      if (RHSMask) { RHSMask->dump(); }
546      if (RHSShift) { RHSShift->dump(); }
547    }
548    // There was an issue with the setup for BitInsert.
549    return false;
550  }
551  if (mDebug) {
552    dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
553    dbgs() << "Op:        "; inst->dump();
554    dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
555    dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
556    dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
557    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
558    dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
559    dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
560    dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
561    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
562  }
563  Constant *offset = NULL;
564  Constant *width = NULL;
565  int32_t lhsMaskVal = 0, rhsMaskVal = 0;
566  int32_t lhsShiftVal = 0, rhsShiftVal = 0;
567  int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
568  int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
569  lhsMaskVal = (int32_t)(LHSMask
570      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
571  rhsMaskVal = (int32_t)(RHSMask
572      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
573  lhsShiftVal = (int32_t)(LHSShift
574      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
575  rhsShiftVal = (int32_t)(RHSShift
576      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
577  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
578  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
579  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
580  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
581  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
582  if (mDebug) {
583      dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
584      dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
585      dbgs() << (RHSMask ? " & E)" : ")");
586      dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
587      dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
588      dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
589      dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
590      dbgs() << "width(B) = " << lhsMaskWidth;
591      dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
592      dbgs() << "offset(B) = " << lhsMaskOffset;
593      dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
594      dbgs() << "Constraints: \n";
595      dbgs() << "\t(1) B ^ E == 0\n";
596      dbgs() << "\t(2-LHS) B is a mask\n";
597      dbgs() << "\t(2-RHS) E is a mask\n";
598      dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
599      dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
600  }
601  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
602    if (mDebug) {
603      dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
604      dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
605      dbgs() << "Failed constraint 1!\n";
606    }
607    return false;
608  }
609  if (mDebug) {
610    dbgs() << "LHS = " << lhsMaskOffset << "";
611    dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
612    dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
613    dbgs() << "\nRHS = " << rhsMaskOffset << "";
614    dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
615    dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
616    dbgs() << "\n";
617  }
618  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
619    offset = ConstantInt::get(aType, lhsMaskOffset, false);
620    width = ConstantInt::get(aType, lhsMaskWidth, false);
621    RHSSrc = RHS;
622    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
623      if (mDebug) {
624        dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
625        dbgs() << "Failed constraint 2!\n";
626      }
627      return false;
628    }
629    if (!LHSShift) {
630      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
631          "MaskShr", LHS);
632    } else if (lhsShiftVal != lhsMaskOffset) {
633      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
634          "MaskShr", LHS);
635    }
636    if (mDebug) {
637      dbgs() << "Optimizing LHS!\n";
638    }
639  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
640    offset = ConstantInt::get(aType, rhsMaskOffset, false);
641    width = ConstantInt::get(aType, rhsMaskWidth, false);
642    LHSSrc = RHSSrc;
643    RHSSrc = LHS;
644    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
645      if (mDebug) {
646        dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
647        dbgs() << "Failed constraint 2!\n";
648      }
649      return false;
650    }
651    if (!RHSShift) {
652      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
653          "MaskShr", RHS);
654    } else if (rhsShiftVal != rhsMaskOffset) {
655      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
656          "MaskShr", RHS);
657    }
658    if (mDebug) {
659      dbgs() << "Optimizing RHS!\n";
660    }
661  } else {
662    if (mDebug) {
663      dbgs() << "Failed constraint 3!\n";
664    }
665    return false;
666  }
667  if (mDebug) {
668    dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
669    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
670    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
671    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
672  }
673  if (!offset || !width) {
674    if (mDebug) {
675      dbgs() << "Either width or offset are NULL, failed detection!\n";
676    }
677    return false;
678  }
679  // Let's create the function signature.
680  std::vector<Type *> callTypes;
681  callTypes.push_back(aType);
682  callTypes.push_back(aType);
683  callTypes.push_back(aType);
684  callTypes.push_back(aType);
685  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
686  std::string name = "__amdil_ubit_insert";
687  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
688  Function *Func =
689    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
690        getOrInsertFunction(llvm::StringRef(name), funcType));
691  Value *Operands[4] = {
692    width,
693    offset,
694    LHSSrc,
695    RHSSrc
696  };
697  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
698  if (mDebug) {
699    dbgs() << "Old Inst: ";
700    inst->dump();
701    dbgs() << "New Inst: ";
702    CI->dump();
703    dbgs() << "\n\n";
704  }
705  CI->insertBefore(inst);
706  inst->replaceAllUsesWith(CI);
707  return true;
708}
709
710bool
711AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst)
712{
713  if (!inst) {
714    return false;
715  }
716  if (!inst->isBinaryOp()) {
717    return false;
718  }
719  if (inst->getOpcode() != Instruction::And) {
720    return false;
721  }
722  if (optLevel == CodeGenOpt::None) {
723    return false;
724  }
725  // We want to do some simple optimizations on shift right/and patterns. The
726  // basic optimization is to turn (A >> B) & C, where A is a 32bit type, B is a
727  // value smaller than 32 and C is a mask. If C is a constant value, then the
728  // following transformation can occur. For signed integers, it turns into the
729  // function call dst = __amdil_ibit_extract(log2(C), B, A). For unsigned
730  // integers, it turns into the function call dst =
731  // __amdil_ubit_extract(log2(C), B, A). The function __amdil_[u|i]bit_extract
732  // can be found in Section 7.9 of the ATI IL spec of the Stream SDK for
733  // Evergreen hardware.
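  // A hypothetical scalar example (illustrative constants only):
  //   %shr = lshr i32 %A, 8
  //   %and = and  i32 %shr, 255      ; 255 is a contiguous 8 bit mask
  // is rewritten by the code below into
  //   %and = call i32 @__amdil_ubit_extract_i32(i32 8, i32 8, i32 %A)
  // where the operands are the mask width in bits, the shift amount, and the
  // source value.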
734  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
735    // This does not work on HD4XXX hardware.
736    return false;
737  }
738  Type *aType = inst->getType();
739  bool isVector = aType->isVectorTy();
740  int numEle = 1;
741  // This only works on 32bit integers
742  if (aType->getScalarType()
743      != Type::getInt32Ty(inst->getContext())) {
744    return false;
745  }
746  if (isVector) {
747    const VectorType *VT = dyn_cast<VectorType>(aType);
748    numEle = VT->getNumElements();
749    // We currently cannot support more than 4 elements in an intrinsic and we
750    // cannot support Vec3 types.
751    if (numEle > 4 || numEle == 3) {
752      return false;
753    }
754  }
755  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
756  // If the first operand is not a shift instruction, then we can return as it
757  // doesn't match this pattern.
758  if (!ShiftInst || !ShiftInst->isShift()) {
759    return false;
760  }
761  // If we are a shift left, then we don't match this pattern.
762  if (ShiftInst->getOpcode() == Instruction::Shl) {
763    return false;
764  }
765  bool isSigned = ShiftInst->isArithmeticShift();
766  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
767  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
768  // Let's make sure that the shift value and the and mask are constant integers.
769  if (!AndMask || !ShrVal) {
770    return false;
771  }
772  Constant *newMaskConst;
773  Constant *shiftValConst;
774  if (isVector) {
775    // Handle the vector case
776    std::vector<Constant *> maskVals;
777    std::vector<Constant *> shiftVals;
778    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
779    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
780    Type *scalarType = AndMaskVec->getType()->getScalarType();
781    assert(AndMaskVec->getNumOperands() ==
782           ShrValVec->getNumOperands() && "cannot have a "
783           "combination where the number of elements to a "
784           "shift and an and are different!");
785    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
786      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
787      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
788      if (!AndCI || !ShiftIC) {
789        return false;
790      }
791      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
792      if (!isMask_32(maskVal)) {
793        return false;
794      }
795      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
796      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
797      // If the mask or shiftval is greater than the bitcount, then break out.
798      if (maskVal >= 32 || shiftVal >= 32) {
799        return false;
800      }
801      // If the mask val is greater than the number of original bits left,
802      // then this optimization is invalid.
803      if (maskVal > (32 - shiftVal)) {
804        return false;
805      }
806      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
807      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
808    }
809    newMaskConst = ConstantVector::get(maskVals);
810    shiftValConst = ConstantVector::get(shiftVals);
811  } else {
812    // Handle the scalar case
813    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
814    // This must be a mask value where all lower bits are set to 1 and then any
815    // bit higher is set to 0.
816    if (!isMask_32(maskVal)) {
817      return false;
818    }
819    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
820    // Count the number of bits set in the mask, this is the width of the
821    // resulting bit set that is extracted from the source value.
822    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
823    // If the mask or shift val is greater than the bitcount, then break out.
824    if (maskVal >= 32 || shiftVal >= 32) {
825      return false;
826    }
827    // If the mask val is greater than the number of original bits left, then
828    // this optimization is invalid.
829    if (maskVal > (32 - shiftVal)) {
830      return false;
831    }
832    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
833    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
834  }
835  // Let's create the function signature.
836  std::vector<Type *> callTypes;
837  callTypes.push_back(aType);
838  callTypes.push_back(aType);
839  callTypes.push_back(aType);
840  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
841  std::string name = "__amdil_ubit_extract";
842  if (isVector) {
843    name += "_v" + itostr(numEle) + "i32";
844  } else {
845    name += "_i32";
846  }
847  // Let's create the function.
848  Function *Func =
849    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
850                       getOrInsertFunction(llvm::StringRef(name), funcType));
851  Value *Operands[3] = {
852    newMaskConst,
853    shiftValConst,
854    ShiftInst->getOperand(0)
855  };
856  // Let's create the call with the operands.
857  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
858  CI->insertBefore(inst);
859  inst->replaceAllUsesWith(CI);
860  return true;
861}
862
863bool
864AMDILPeepholeOpt::expandBFI(CallInst *CI)
865{
866  if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
867    return false;
868  }
869  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
870  if (!LHS->getName().startswith("__amdil_bfi")) {
871    return false;
872  }
873  Type* type = CI->getOperand(0)->getType();
874  Constant *negOneConst = NULL;
875  if (type->isVectorTy()) {
876    std::vector<Constant *> negOneVals;
877    negOneConst = ConstantInt::get(CI->getContext(),
878        APInt(32, StringRef("-1"), 10));
879    for (size_t x = 0,
880        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
881      negOneVals.push_back(negOneConst);
882    }
883    negOneConst = ConstantVector::get(negOneVals);
884  } else {
885    negOneConst = ConstantInt::get(CI->getContext(),
886        APInt(32, StringRef("-1"), 10));
887  }
888  // __amdil_bfi => (A & B) | (~A & C)
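  // For example (illustrative value only): with A = 0x0000FF00 the result
  // takes bits 8..15 from B and every other bit from C, which is exactly what
  // the and/xor/and/or sequence constructed below computes.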
889  BinaryOperator *lhs =
890    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
891        CI->getOperand(1), "bfi_and", CI);
892  BinaryOperator *rhs =
893    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
894        "bfi_not", CI);
895  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
896      "bfi_and", CI);
897  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
898  CI->replaceAllUsesWith(lhs);
899  return true;
900}
901
902bool
903AMDILPeepholeOpt::expandBFM(CallInst *CI)
904{
905  if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
906    return false;
907  }
908  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
909  if (!LHS->getName().startswith("__amdil_bfm")) {
910    return false;
911  }
912  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
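  // For example (illustrative values only): src0 = 8 and src1 = 4 give
  // ((1 << 8) - 1) << 4 = 0x00000FF0, an 8 bit wide mask starting at bit 4,
  // which the and/shl/sub/shl sequence constructed below reproduces.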
913  Constant *newMaskConst = NULL;
914  Constant *newShiftConst = NULL;
915  Type* type = CI->getOperand(0)->getType();
916  if (type->isVectorTy()) {
917    std::vector<Constant*> newMaskVals, newShiftVals;
918    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
919    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
920    for (size_t x = 0,
921        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
922      newMaskVals.push_back(newMaskConst);
923      newShiftVals.push_back(newShiftConst);
924    }
925    newMaskConst = ConstantVector::get(newMaskVals);
926    newShiftConst = ConstantVector::get(newShiftVals);
927  } else {
928    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
929    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
930  }
931  BinaryOperator *lhs =
932    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
933        newMaskConst, "bfm_mask", CI);
934  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
935      lhs, "bfm_shl", CI);
936  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
937      newShiftConst, "bfm_sub", CI);
938  BinaryOperator *rhs =
939    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
940        newMaskConst, "bfm_mask", CI);
941  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
942  CI->replaceAllUsesWith(lhs);
943  return true;
944}
945
946bool
947AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
948{
949  Instruction *inst = (*bbb);
950  if (optimizeCallInst(bbb)) {
951    return true;
952  }
953  if (optimizeBitExtract(inst)) {
954    return false;
955  }
956  if (optimizeBitInsert(inst)) {
957    return false;
958  }
959  if (correctMisalignedMemOp(inst)) {
960    return false;
961  }
962  return false;
963}
964bool
965AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
966{
967  LoadInst *linst = dyn_cast<LoadInst>(inst);
968  StoreInst *sinst = dyn_cast<StoreInst>(inst);
969  unsigned alignment;
970  Type* Ty = inst->getType();
971  if (linst) {
972    alignment = linst->getAlignment();
973    Ty = inst->getType();
974  } else if (sinst) {
975    alignment = sinst->getAlignment();
976    Ty = sinst->getValueOperand()->getType();
977  } else {
978    return false;
979  }
980  unsigned size = getTypeSize(Ty);
981  if (size <= alignment) {
982    return false;
983  }
984  if (!Ty->isStructTy()) {
985    return false;
986  }
987  if (alignment < 4) {
988    if (linst) {
989      linst->setAlignment(0);
990      return true;
991    } else if (sinst) {
992      sinst->setAlignment(0);
993      return true;
994    }
995  }
996  return false;
997}
998bool
999AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI)
1000{
1001  if (!CI) {
1002    return false;
1003  }
1004  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
1005  std::string namePrefix = LHS->getName().substr(0, 14);
1006  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
1007      && namePrefix != "__amdil_imul24_high") {
1008    return false;
1009  }
1010  if (mSTM->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) {
1011    return false;
1012  }
1013  return true;
1014}
1015
1016void
1017AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI)
1018{
1019  assert(isSigned24BitOps(CI) && "Must be a "
1020      "signed 24 bit operation to call this function!");
1021  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
1022  // On 7XX and 8XX we do not have signed 24bit, so we need to
1023  // expand it to the following:
1024  // imul24 turns into 32bit imul
1025  // imad24 turns into 32bit imad
1026  // imul24_high turns into 32bit imul_high
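  // A sketch of the imad24 case (the "_i32" suffix on the callee is assumed
  // here; only the "__amdil_imad24" prefix is actually checked):
  //   %r = call i32 @__amdil_imad24_i32(i32 %a, i32 %b, i32 %c)
  // becomes
  //   %r = call i32 @__amdil_imad_i32(i32 %a, i32 %b, i32 %c)
  // while an imul24 call is simply replaced by a plain 32 bit 'mul'
  // instruction.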
1027  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
1028    Type *aType = CI->getOperand(0)->getType();
1029    bool isVector = aType->isVectorTy();
1030    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
1031    std::vector<Type*> callTypes;
1032    callTypes.push_back(CI->getOperand(0)->getType());
1033    callTypes.push_back(CI->getOperand(1)->getType());
1034    callTypes.push_back(CI->getOperand(2)->getType());
1035    FunctionType *funcType =
1036      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
1037    std::string name = "__amdil_imad";
1038    if (isVector) {
1039      name += "_v" + itostr(numEle) + "i32";
1040    } else {
1041      name += "_i32";
1042    }
1043    Function *Func = dyn_cast<Function>(
1044                       CI->getParent()->getParent()->getParent()->
1045                       getOrInsertFunction(llvm::StringRef(name), funcType));
1046    Value *Operands[3] = {
1047      CI->getOperand(0),
1048      CI->getOperand(1),
1049      CI->getOperand(2)
1050    };
1051    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
1052    nCI->insertBefore(CI);
1053    CI->replaceAllUsesWith(nCI);
1054  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
1055    BinaryOperator *mulOp =
1056      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
1057          CI->getOperand(1), "imul24", CI);
1058    CI->replaceAllUsesWith(mulOp);
1059  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
1060    Type *aType = CI->getOperand(0)->getType();
1061
1062    bool isVector = aType->isVectorTy();
1063    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
1064    std::vector<Type*> callTypes;
1065    callTypes.push_back(CI->getOperand(0)->getType());
1066    callTypes.push_back(CI->getOperand(1)->getType());
1067    FunctionType *funcType =
1068      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
1069    std::string name = "__amdil_imul_high";
1070    if (isVector) {
1071      name += "_v" + itostr(numEle) + "i32";
1072    } else {
1073      name += "_i32";
1074    }
1075    Function *Func = dyn_cast<Function>(
1076                       CI->getParent()->getParent()->getParent()->
1077                       getOrInsertFunction(llvm::StringRef(name), funcType));
1078    Value *Operands[2] = {
1079      CI->getOperand(0),
1080      CI->getOperand(1)
1081    };
1082    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
1083    nCI->insertBefore(CI);
1084    CI->replaceAllUsesWith(nCI);
1085  }
1086}
1087
1088bool
1089AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI)
1090{
1091  return (CI != NULL && mRWGOpt
1092          && CI->getOperand(CI->getNumOperands() - 1)->getName()
1093          == "__amdil_get_local_size_int");
1094}
1095
1096void
1097AMDILPeepholeOpt::expandRWGLocalOpt(CallInst *CI)
1098{
1099  assert(isRWGLocalOpt(CI) &&
1100         "This optmization only works when the call inst is get_local_size!");
1101  std::vector<Constant *> consts;
1102  for (uint32_t x = 0; x < 3; ++x) {
1103    uint32_t val = mSTM->getGlobalManager()->getLocal(mF->getName(), x);
1104    consts.push_back(ConstantInt::get(Type::getInt32Ty(*mCTX), val));
1105  }
1106  consts.push_back(ConstantInt::get(Type::getInt32Ty(*mCTX), 0));
1107  Value *cVec = ConstantVector::get(consts);
1108  CI->replaceAllUsesWith(cVec);
1109  ++LocalFuncs;
1110  return;
1111}
1112
1113bool
1114AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI)
1115{
1116  if (!CI) {
1117    return false;
1118  }
1119  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX
1120      && (mSTM->getDeviceName() == "cayman")) {
1121    return false;
1122  }
1123  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
1124      == "__amdil_improved_div";
1125}
1126
1127void
1128AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI)
1129{
1130  assert(convertAccurateDivide(CI)
1131         && "expanding accurate divide can only happen if it is expandable!");
1132  BinaryOperator *divOp =
1133    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
1134                           CI->getOperand(1), "fdiv32", CI);
1135  CI->replaceAllUsesWith(divOp);
1136}
1137
1138bool
1139AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI)
1140{
1141  if (optLevel != CodeGenOpt::None) {
1142    return false;
1143  }
1144
1145  if (!CI) {
1146    return false;
1147  }
1148
1149  unsigned funcNameIdx = CI->getNumOperands() - 1;
1151  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
1152  if (calleeName != "__amdil_image2d_read_norm"
1153   && calleeName != "__amdil_image2d_read_unnorm"
1154   && calleeName != "__amdil_image3d_read_norm"
1155   && calleeName != "__amdil_image3d_read_unnorm") {
1156    return false;
1157  }
1158
1159  unsigned samplerIdx = 1;
1161  Value *sampler = CI->getOperand(samplerIdx);
1162  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
1163  if (!lInst) {
1164    return false;
1165  }
1166
1167  if (lInst->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
1168    return false;
1169  }
1170
1171  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
1172  // If we are loading from what is not a global value, then we
1173  // fail and return.
1174  if (!gv) {
1175    return false;
1176  }
1177
1178  // If we don't have an initializer, or the initializer is not a 32bit
1179  // integer, we fail.
1180  if (!gv->hasInitializer()
1181      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
1182    return false;
1183  }
1184
1185  // Now that we have the global variable initializer, let's replace
1186  // all uses of the load instruction with the samplerVal and
1187  // reparse the __amdil_is_constant() function.
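  // Illustrative example (the name and initializer value are hypothetical):
  // if the kernel defines a sampler such as
  //   @smp = internal global i32 16
  // then the load of @smp feeding the image read intrinsic is replaced by the
  // constant 16, making the sampler value visible at compile time.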
1188  Constant *samplerVal = gv->getInitializer();
1189  lInst->replaceAllUsesWith(samplerVal);
1190  return true;
1191}
1192
1193bool
1194AMDILPeepholeOpt::doInitialization(Module &M)
1195{
1196  return false;
1197}
1198
1199bool
1200AMDILPeepholeOpt::doFinalization(Module &M)
1201{
1202  return false;
1203}
1204
1205void
1206AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
1207{
1208  AU.addRequired<MachineFunctionAnalysis>();
1209  FunctionPass::getAnalysisUsage(AU);
1210  AU.setPreservesAll();
1211}
1212