ScalarReplAggregates.cpp revision c5c43b958cf2f251c836f94d4499adb6296f0611
1//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This transformation implements the well known scalar replacement of
11// aggregates transformation.  This xform breaks up alloca instructions of
12// aggregate type (structure or array) into individual alloca instructions for
13// each member (if possible).  Then, if possible, it transforms the individual
14// alloca instructions into nice clean scalar SSA form.
15//
16// This combines a simple SRoA algorithm with the Mem2Reg algorithm because
17// they often interact, especially for C++ programs.  As such, iterating
18// between SRoA and Mem2Reg until we run out of things to promote works well.
19//
20//===----------------------------------------------------------------------===//
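// Illustrative sketch (editorial addition, not from the original revision):
// given IR along the lines of
//   %pair = alloca { i32, i32 }
//   %fld  = getelementptr { i32, i32 }* %pair, i32 0, i32 1
//   store i32 7, i32* %fld
//   %v    = load i32* %fld
// SRoA would replace %pair with one scalar alloca per member, after which
// Mem2Reg can turn those scalar allocas into plain SSA values.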
21
22#define DEBUG_TYPE "scalarrepl"
23#include "llvm/Transforms/Scalar.h"
24#include "llvm/Constants.h"
25#include "llvm/DerivedTypes.h"
26#include "llvm/Function.h"
27#include "llvm/GlobalVariable.h"
28#include "llvm/Instructions.h"
29#include "llvm/IntrinsicInst.h"
30#include "llvm/LLVMContext.h"
31#include "llvm/Module.h"
32#include "llvm/Pass.h"
33#include "llvm/Analysis/Dominators.h"
34#include "llvm/Analysis/Loads.h"
35#include "llvm/Analysis/ValueTracking.h"
36#include "llvm/Target/TargetData.h"
37#include "llvm/Transforms/Utils/PromoteMemToReg.h"
38#include "llvm/Transforms/Utils/Local.h"
39#include "llvm/Transforms/Utils/SSAUpdater.h"
40#include "llvm/Support/CallSite.h"
41#include "llvm/Support/Debug.h"
42#include "llvm/Support/ErrorHandling.h"
43#include "llvm/Support/GetElementPtrTypeIterator.h"
44#include "llvm/Support/IRBuilder.h"
45#include "llvm/Support/MathExtras.h"
46#include "llvm/Support/raw_ostream.h"
47#include "llvm/ADT/SetVector.h"
48#include "llvm/ADT/SmallVector.h"
49#include "llvm/ADT/Statistic.h"
50using namespace llvm;
51
52STATISTIC(NumReplaced,  "Number of allocas broken up");
53STATISTIC(NumPromoted,  "Number of allocas promoted");
54STATISTIC(NumAdjusted,  "Number of scalar allocas adjusted to allow promotion");
55STATISTIC(NumConverted, "Number of aggregates converted to scalar");
56STATISTIC(NumGlobals,   "Number of allocas copied from constant global");
57
58namespace {
59  struct SROA : public FunctionPass {
60    SROA(int T, bool hasDT, char &ID)
61      : FunctionPass(ID), HasDomTree(hasDT) {
62      if (T == -1)
63        SRThreshold = 128;
64      else
65        SRThreshold = T;
66    }
67
68    bool runOnFunction(Function &F);
69
70    bool performScalarRepl(Function &F);
71    bool performPromotion(Function &F);
72
73  private:
74    bool HasDomTree;
75    TargetData *TD;
76
77    /// DeadInsts - Keep track of instructions we have made dead, so that
78    /// we can remove them after we are done working.
79    SmallVector<Value*, 32> DeadInsts;
80
81    /// AllocaInfo - When analyzing uses of an alloca instruction, this captures
82    /// information about the uses.  All these fields are initialized to false
83    /// and set to true when something is learned.
84    struct AllocaInfo {
85      /// The alloca to promote.
86      AllocaInst *AI;
87
88      /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite
89      /// looping and avoid redundant work.
90      SmallPtrSet<PHINode*, 8> CheckedPHIs;
91
92      /// isUnsafe - This is set to true if the alloca cannot be SROA'd.
93      bool isUnsafe : 1;
94
95      /// isMemCpySrc - This is true if this aggregate is memcpy'd from.
96      bool isMemCpySrc : 1;
97
98      /// isMemCpyDst - This is true if this aggregate is memcpy'd into.
99      bool isMemCpyDst : 1;
100
101      /// hasSubelementAccess - This is true if a subelement of the alloca is
102      /// ever accessed, or false if the alloca is only accessed by mem
103      /// intrinsics or by loads/stores that access the entire alloca at once.
104      bool hasSubelementAccess : 1;
105
106      /// hasALoadOrStore - This is true if there are any loads or stores to it.
107      /// The alloca may just be accessed with memcpy, for example, which would
108      /// not set this.
109      bool hasALoadOrStore : 1;
110
111      explicit AllocaInfo(AllocaInst *ai)
112        : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false),
113          hasSubelementAccess(false), hasALoadOrStore(false) {}
114    };
115
116    unsigned SRThreshold;
117
118    void MarkUnsafe(AllocaInfo &I, Instruction *User) {
119      I.isUnsafe = true;
120      DEBUG(dbgs() << "  Transformation preventing inst: " << *User << '\n');
121    }
122
123    bool isSafeAllocaToScalarRepl(AllocaInst *AI);
124
125    void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info);
126    void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset,
127                                         AllocaInfo &Info);
128    void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info);
129    void isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
130                         const Type *MemOpType, bool isStore, AllocaInfo &Info,
131                         Instruction *TheAccess, bool AllowWholeAccess);
132    bool TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size);
133    uint64_t FindElementAndOffset(const Type *&T, uint64_t &Offset,
134                                  const Type *&IdxTy);
135
136    void DoScalarReplacement(AllocaInst *AI,
137                             std::vector<AllocaInst*> &WorkList);
138    void DeleteDeadInstructions();
139
140    void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
141                              SmallVector<AllocaInst*, 32> &NewElts);
142    void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
143                        SmallVector<AllocaInst*, 32> &NewElts);
144    void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
145                    SmallVector<AllocaInst*, 32> &NewElts);
146    void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
147                                      AllocaInst *AI,
148                                      SmallVector<AllocaInst*, 32> &NewElts);
149    void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
150                                       SmallVector<AllocaInst*, 32> &NewElts);
151    void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
152                                      SmallVector<AllocaInst*, 32> &NewElts);
153
154    static MemTransferInst *isOnlyCopiedFromConstantGlobal(AllocaInst *AI);
155  };
156
157  // SROA_DT - SROA that uses DominatorTree.
158  struct SROA_DT : public SROA {
159    static char ID;
160  public:
161    SROA_DT(int T = -1) : SROA(T, true, ID) {
162      initializeSROA_DTPass(*PassRegistry::getPassRegistry());
163    }
164
165    // getAnalysisUsage - This pass requires DominatorTree, and we know it
166    // will not alter the CFG, so say so.
167    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
168      AU.addRequired<DominatorTree>();
169      AU.setPreservesCFG();
170    }
171  };
172
173  // SROA_SSAUp - SROA that uses SSAUpdater.
174  struct SROA_SSAUp : public SROA {
175    static char ID;
176  public:
177    SROA_SSAUp(int T = -1) : SROA(T, false, ID) {
178      initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry());
179    }
180
181    // getAnalysisUsage - This pass does not require any passes, but we know it
182    // will not alter the CFG, so say so.
183    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
184      AU.setPreservesCFG();
185    }
186  };
187
188}
189
190char SROA_DT::ID = 0;
191char SROA_SSAUp::ID = 0;
192
193INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
194                "Scalar Replacement of Aggregates (DT)", false, false)
195INITIALIZE_PASS_DEPENDENCY(DominatorTree)
196INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
197                "Scalar Replacement of Aggregates (DT)", false, false)
198
199INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
200                      "Scalar Replacement of Aggregates (SSAUp)", false, false)
201INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
202                    "Scalar Replacement of Aggregates (SSAUp)", false, false)
203
204// Public interface to the ScalarReplAggregates pass
205FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold,
206                                                   bool UseDomTree) {
207  if (UseDomTree)
208    return new SROA_DT(Threshold);
209  return new SROA_SSAUp(Threshold);
210}
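// Hedged usage sketch (client code, not part of this file; PM is an assumed
// PassManager instance):
//   PM.add(createScalarReplAggregatesPass(/*Threshold=*/-1,
//                                         /*UseDomTree=*/true));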
211
212
213//===----------------------------------------------------------------------===//
214// Convert To Scalar Optimization.
215//===----------------------------------------------------------------------===//
216
217namespace {
218/// ConvertToScalarInfo - This class implements the "Convert To Scalar"
219/// optimization, which scans the uses of an alloca and determines if it can
220/// rewrite it in terms of a single new alloca that can be mem2reg'd.
221class ConvertToScalarInfo {
222  /// AllocaSize - The size of the alloca being considered in bytes.
223  unsigned AllocaSize;
224  const TargetData &TD;
225
226  /// IsNotTrivial - This is set to true if there is some access to the object
227  /// which means that mem2reg can't promote it.
228  bool IsNotTrivial;
229
230  /// VectorTy - This tracks the vector type that we should promote the alloca
231  /// to if it is possible to turn it into a vector.  This starts out null, and
232  /// if that isn't possible, it gets set to VoidTy to mark the integer fallback.
233  const Type *VectorTy;
234
235  /// HadAVector - True if there is at least one vector access to the alloca.
236  /// We don't want to turn random arrays into vectors and use vector element
237  /// insert/extract, but if there are element accesses to something that is
238  /// also declared as a vector, we do want to promote to a vector.
239  bool HadAVector;
240
241  /// HadNonMemTransferAccess - True if there is at least one access to the
242  /// alloca that is not a MemTransferInst.  We don't want to turn structs into
243  /// large integers unless there is some potential for optimization.
244  bool HadNonMemTransferAccess;
245
246public:
247  explicit ConvertToScalarInfo(unsigned Size, const TargetData &td)
248    : AllocaSize(Size), TD(td), IsNotTrivial(false), VectorTy(0),
249      HadAVector(false), HadNonMemTransferAccess(false) { }
250
251  AllocaInst *TryConvert(AllocaInst *AI);
252
253private:
254  bool CanConvertToScalar(Value *V, uint64_t Offset);
255  void MergeInType(const Type *In, uint64_t Offset, bool IsLoadOrStore);
256  bool MergeInVectorType(const VectorType *VInTy, uint64_t Offset);
257  void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);
258
259  Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType,
260                                    uint64_t Offset, IRBuilder<> &Builder);
261  Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
262                                   uint64_t Offset, IRBuilder<> &Builder);
263};
264} // end anonymous namespace.
265
266
267/// TryConvert - Analyze the specified alloca, and if it is safe to do so,
268/// rewrite it to be a new alloca which is mem2reg'able.  This returns the new
269/// alloca if possible or null if not.
270AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
271  // If we can't convert this scalar, or if mem2reg can trivially do it, bail
272  // out.
273  if (!CanConvertToScalar(AI, 0) || !IsNotTrivial)
274    return 0;
275
276  // If we were able to find a vector type that can handle this with
277  // insert/extract elements, and if there was at least one use that had
278  // a vector type, promote this to a vector.  We don't want to promote
279  // random stuff that doesn't use vectors (e.g. <9 x double>) because then
280  // we just get a lot of insert/extracts.  If at least one vector is
281  // involved, then we probably really do have a union of vector/array.
282  const Type *NewTy;
283  if (VectorTy && VectorTy->isVectorTy() && HadAVector) {
284    DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n  TYPE = "
285          << *VectorTy << '\n');
286    NewTy = VectorTy;  // Use the vector type.
287  } else {
288    unsigned BitWidth = AllocaSize * 8;
289    if (!HadAVector && !HadNonMemTransferAccess &&
290        !TD.fitsInLegalInteger(BitWidth))
291      return 0;
292
293    DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
294    // Create and insert the integer alloca.
295    NewTy = IntegerType::get(AI->getContext(), BitWidth);
296  }
297  AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin());
298  ConvertUsesToScalar(AI, NewAI, 0);
299  return NewAI;
300}
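// Rough illustration of the two outcomes above (names are made up): a 16-byte
//   %u = alloca { float, float, float, float }
// that is also accessed as <4 x float> is rewritten as "alloca <4 x float>",
// whereas accesses that cannot be described by a single vector type produce a
// plain "alloca i128", which mem2reg can still promote.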
301
302/// MergeInType - Add the 'In' type to the accumulated vector type (VectorTy)
303/// so far at the offset specified by Offset (which is specified in bytes).
304///
305/// There are three cases we handle here:
306///   1) A union of vector types of the same size and potentially their elements.
307///      Here we turn element accesses into insert/extract element operations.
308///      This promotes a <4 x float> with a store of float to the third element
309///      into a <4 x float> that uses insert element.
310///   2) A union of vector types with power-of-2 size differences, e.g. a float,
311///      <2 x float> and <4 x float>.  Here we turn element accesses into insert
312///      and extract element operations, and <2 x float> accesses into a cast to
313///      <2 x double>, an extract, and a cast back to <2 x float>.
314///   3) A fully general blob of memory, which we turn into some (potentially
315///      large) integer type with extract and insert operations where the loads
316///      and stores would mutate the memory.  We mark this by setting VectorTy
317///      to VoidTy.
318void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset,
319                                      bool IsLoadOrStore) {
320  // If we already decided to turn this into a blob of integer memory, there is
321  // nothing to be done.
322  if (VectorTy && VectorTy->isVoidTy())
323    return;
324
325  // If this could be contributing to a vector, analyze it.
326
327  // If the In type is a vector that is the same size as the alloca, see if it
328  // matches the existing VectorTy.
329  if (const VectorType *VInTy = dyn_cast<VectorType>(In)) {
330    if (MergeInVectorType(VInTy, Offset))
331      return;
332  } else if (In->isFloatTy() || In->isDoubleTy() ||
333             (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 &&
334              isPowerOf2_32(In->getPrimitiveSizeInBits()))) {
335    // Full width accesses can be ignored, because they can always be turned
336    // into bitcasts.
337    unsigned EltSize = In->getPrimitiveSizeInBits()/8;
338    if (IsLoadOrStore && EltSize == AllocaSize)
339      return;
340
341    // If we're accessing something that could be an element of a vector, see
342    // if the implied vector agrees with what we already have and if Offset is
343    // compatible with it.
344    if (Offset % EltSize == 0 && AllocaSize % EltSize == 0) {
345      if (!VectorTy) {
346        VectorTy = VectorType::get(In, AllocaSize/EltSize);
347        return;
348      }
349
350      unsigned CurrentEltSize = cast<VectorType>(VectorTy)->getElementType()
351                                ->getPrimitiveSizeInBits()/8;
352      if (EltSize == CurrentEltSize)
353        return;
354    }
355  }
356
357  // Otherwise, we have a case that we can't handle with an optimized vector
358  // form.  We can still turn this into a large integer.
359  VectorTy = Type::getVoidTy(In->getContext());
360}
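// A worked example of the scalar branch above (illustrative offsets): for a
// 16-byte alloca, a 'float' load at byte offset 8 passes the
// Offset % EltSize == 0 and AllocaSize % EltSize == 0 checks, so VectorTy
// becomes <4 x float>.  A later i16 access at byte offset 1 fails the
// divisibility check and resets VectorTy to VoidTy, i.e. the large-integer
// fallback.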
361
362/// MergeInVectorType - Handles the vector case of MergeInType, returning true
363/// if the type was successfully merged and false otherwise.
364bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy,
365                                            uint64_t Offset) {
366  // Remember if we saw a vector type.
367  HadAVector = true;
368
369  // TODO: Support nonzero offsets?
370  if (Offset != 0)
371    return false;
372
373  // Only allow vectors that are a power-of-2 away from the size of the alloca.
374  if (!isPowerOf2_64(AllocaSize / (VInTy->getBitWidth() / 8)))
375    return false;
376
377  // If this is the first vector we see, remember the type so that we know the
378  // element size.
379  if (!VectorTy) {
380    VectorTy = VInTy;
381    return true;
382  }
383
384  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
385  unsigned InBitWidth = VInTy->getBitWidth();
386
387  // Vectors of the same size can be converted using a simple bitcast.
388  if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8))
389    return true;
390
391  const Type *ElementTy = cast<VectorType>(VectorTy)->getElementType();
392  const Type *InElementTy = cast<VectorType>(VInTy)->getElementType();
393
394  // Do not allow mixed integer and floating-point accesses from vectors of
395  // different sizes.
396  if (ElementTy->isFloatingPointTy() != InElementTy->isFloatingPointTy())
397    return false;
398
399  if (ElementTy->isFloatingPointTy()) {
400    // Only allow floating-point vectors of different sizes if they have the
401    // same element type.
402    // TODO: This could be loosened a bit, but would anything benefit?
403    if (ElementTy != InElementTy)
404      return false;
405
406    // There are no arbitrary-precision floating-point types, which limits the
407    // number of legal vector types with larger element types that we can form
408    // to bitcast and extract a subvector.
409    // TODO: We could support some more cases with mixed fp128 and double here.
410    if (!(BitWidth == 64 || BitWidth == 128) ||
411        !(InBitWidth == 64 || InBitWidth == 128))
412      return false;
413  } else {
414    assert(ElementTy->isIntegerTy() && "Vector elements must be either integer "
415                                       "or floating-point.");
416    unsigned BitWidth = ElementTy->getPrimitiveSizeInBits();
417    unsigned InBitWidth = InElementTy->getPrimitiveSizeInBits();
418
419    // Do not allow integer types smaller than a byte or types whose widths are
420    // not a multiple of a byte.
421    if (BitWidth < 8 || InBitWidth < 8 ||
422        BitWidth % 8 != 0 || InBitWidth % 8 != 0)
423      return false;
424  }
425
426  // Pick the largest of the two vector types.
427  if (InBitWidth > BitWidth)
428    VectorTy = VInTy;
429
430  return true;
431}
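// Hedged example of the merge rules: for a 16-byte alloca, seeing both
// <2 x float> (64 bits) and <4 x float> (128 bits) accesses at offset 0 keeps
// the larger type, so VectorTy ends up as <4 x float>.  Mixing <2 x float>
// with <2 x i32>, by contrast, is rejected above because the element kinds
// (floating point vs. integer) differ.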
432
433/// CanConvertToScalar - V is a pointer.  If we can convert the pointee and all
434/// its accesses to a single vector type, return true and set VectorTy to
435/// the new type.  If we could convert the alloca into a single promotable
436/// integer, return true but set VectorTy to VoidTy.  Further, if the use is
437/// not a completely trivial use that mem2reg could promote, set IsNotTrivial.
438/// Offset is the current offset from the base of the alloca being analyzed.
439///
440/// If we see at least one access to the value as a vector type, set the
441/// HadAVector flag.
442bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
443  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
444    Instruction *User = cast<Instruction>(*UI);
445
446    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
447      // Don't break volatile loads.
448      if (LI->isVolatile())
449        return false;
450      // Don't touch MMX operations.
451      if (LI->getType()->isX86_MMXTy())
452        return false;
453      HadNonMemTransferAccess = true;
454      MergeInType(LI->getType(), Offset, true);
455      continue;
456    }
457
458    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
459      // Storing the pointer, not into the value?
460      if (SI->getOperand(0) == V || SI->isVolatile()) return false;
461      // Don't touch MMX operations.
462      if (SI->getOperand(0)->getType()->isX86_MMXTy())
463        return false;
464      HadNonMemTransferAccess = true;
465      MergeInType(SI->getOperand(0)->getType(), Offset, true);
466      continue;
467    }
468
469    if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
470      IsNotTrivial = true;  // Can't be mem2reg'd.
471      if (!CanConvertToScalar(BCI, Offset))
472        return false;
473      continue;
474    }
475
476    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
477      // If this is a GEP with variable indices, we can't handle it.
478      if (!GEP->hasAllConstantIndices())
479        return false;
480
481      // Compute the offset that this GEP adds to the pointer.
482      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
483      uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
484                                               &Indices[0], Indices.size());
485      // See if all uses can be converted.
486      if (!CanConvertToScalar(GEP, Offset+GEPOffset))
487        return false;
488      IsNotTrivial = true;  // Can't be mem2reg'd.
489      HadNonMemTransferAccess = true;
490      continue;
491    }
492
493    // If this is a constant sized memset of a constant value (e.g. 0) we can
494    // handle it.
495    if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
496      // Store of constant value and constant size.
497      if (!isa<ConstantInt>(MSI->getValue()) ||
498          !isa<ConstantInt>(MSI->getLength()))
499        return false;
500      IsNotTrivial = true;  // Can't be mem2reg'd.
501      HadNonMemTransferAccess = true;
502      continue;
503    }
504
505    // If this is a memcpy or memmove into or out of the whole allocation, we
506    // can handle it like a load or store of the scalar type.
507    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
508      ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength());
509      if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0)
510        return false;
511
512      IsNotTrivial = true;  // Can't be mem2reg'd.
513      continue;
514    }
515
516    // Otherwise, we cannot handle this!
517    return false;
518  }
519
520  return true;
521}
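// Sketch of what this visitor accepts (illustrative IR): scalar loads and
// stores, bitcasts, constant-index GEPs such as
//   %p = getelementptr [4 x i32]* %a, i32 0, i32 2
// constant-value/constant-length memsets, and whole-alloca memcpy/memmove.
// A volatile access, an x86 MMX access, or a variable-index GEP makes the
// whole conversion give up.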
522
523/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
524/// directly.  This happens when we are converting an "integer union" to a
525/// single integer scalar, or when we are converting a "vector union" to a
526/// vector with insert/extractelement instructions.
527///
528/// Offset is an offset from the original alloca, in bits that need to be
529/// shifted to the right.  By the end of this, there should be no uses of Ptr.
530void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
531                                              uint64_t Offset) {
532  while (!Ptr->use_empty()) {
533    Instruction *User = cast<Instruction>(Ptr->use_back());
534
535    if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
536      ConvertUsesToScalar(CI, NewAI, Offset);
537      CI->eraseFromParent();
538      continue;
539    }
540
541    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
542      // Compute the offset that this GEP adds to the pointer.
543      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
544      uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
545                                               &Indices[0], Indices.size());
546      ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8);
547      GEP->eraseFromParent();
548      continue;
549    }
550
551    IRBuilder<> Builder(User);
552
553    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
554      // The load is a bit extract from NewAI shifted right by Offset bits.
555      Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp");
556      Value *NewLoadVal
557        = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder);
558      LI->replaceAllUsesWith(NewLoadVal);
559      LI->eraseFromParent();
560      continue;
561    }
562
563    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
564      assert(SI->getOperand(0) != Ptr && "Consistency error!");
565      Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
566      Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
567                                             Builder);
568      Builder.CreateStore(New, NewAI);
569      SI->eraseFromParent();
570
571      // If the load we just inserted is now dead, then the inserted store
572      // overwrote the entire thing.
573      if (Old->use_empty())
574        Old->eraseFromParent();
575      continue;
576    }
577
578    // If this is a constant sized memset of a constant value (e.g. 0) we can
579    // transform it into a store of the expanded constant value.
580    if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
581      assert(MSI->getRawDest() == Ptr && "Consistency error!");
582      unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
583      if (NumBytes != 0) {
584        unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue();
585
586        // Compute the value replicated the right number of times.
587        APInt APVal(NumBytes*8, Val);
588
589        // Splat the value if non-zero.
590        if (Val)
591          for (unsigned i = 1; i != NumBytes; ++i)
592            APVal |= APVal << 8;
593
594        Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
595        Value *New = ConvertScalar_InsertValue(
596                                    ConstantInt::get(User->getContext(), APVal),
597                                               Old, Offset, Builder);
598        Builder.CreateStore(New, NewAI);
599
600        // If the load we just inserted is now dead, then the memset overwrote
601        // the entire thing.
602        if (Old->use_empty())
603          Old->eraseFromParent();
604      }
605      MSI->eraseFromParent();
606      continue;
607    }
608
609    // If this is a memcpy or memmove into or out of the whole allocation, we
610    // can handle it like a load or store of the scalar type.
611    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
612      assert(Offset == 0 && "must be store to start of alloca");
613
614      // If the source and destination are both the same alloca, then this is
615      // a noop copy-to-self; just delete it.  Otherwise, emit a load and store
616      // as appropriate.
617      AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &TD, 0));
618
619      if (GetUnderlyingObject(MTI->getSource(), &TD, 0) != OrigAI) {
620        // Dest must be OrigAI, change this to be a load from the original
621        // pointer (bitcasted), then a store to our new alloca.
622        assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
623        Value *SrcPtr = MTI->getSource();
624        const PointerType* SPTy = cast<PointerType>(SrcPtr->getType());
625        const PointerType* AIPTy = cast<PointerType>(NewAI->getType());
626        if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
627          AIPTy = PointerType::get(AIPTy->getElementType(),
628                                   SPTy->getAddressSpace());
629        }
630        SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy);
631
632        LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
633        SrcVal->setAlignment(MTI->getAlignment());
634        Builder.CreateStore(SrcVal, NewAI);
635      } else if (GetUnderlyingObject(MTI->getDest(), &TD, 0) != OrigAI) {
636        // Src must be OrigAI, change this to be a load from NewAI then a store
637        // through the original dest pointer (bitcasted).
638        assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
639        LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");
640
641        const PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType());
642        const PointerType* AIPTy = cast<PointerType>(NewAI->getType());
643        if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
644          AIPTy = PointerType::get(AIPTy->getElementType(),
645                                   DPTy->getAddressSpace());
646        }
647        Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy);
648
649        StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr);
650        NewStore->setAlignment(MTI->getAlignment());
651      } else {
652        // Noop transfer. Src == Dst
653      }
654
655      MTI->eraseFromParent();
656      continue;
657    }
658
659    llvm_unreachable("Unsupported operation!");
660  }
661}
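// Sketch of the memset case above (illustrative values only): a
//   call void @llvm.memset.p0i8.i32(i8* %p, i8 1, i32 4, i32 1, i1 false)
// over a 4-byte alloca becomes, roughly,
//   %old = load i32* %NewAI
//   store i32 16843009, i32* %NewAI       ; 0x01010101, the splatted byte
// and the now-dead %old load is erased.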
662
663/// getScaledElementType - Gets a scaled element type for a partial vector
664/// access of an alloca. The input type must be an integer or float, and
665/// the resulting type must be an integer, float or double.
666static const Type *getScaledElementType(const Type *OldTy,
667                                        unsigned NewBitWidth) {
668  assert((OldTy->isIntegerTy() || OldTy->isFloatTy()) && "Partial vector "
669         "accesses must be scaled from integer or float elements.");
670
671  LLVMContext &Context = OldTy->getContext();
672
673  if (OldTy->isIntegerTy())
674    return Type::getIntNTy(Context, NewBitWidth);
675  if (NewBitWidth == 32)
676    return Type::getFloatTy(Context);
677  if (NewBitWidth == 64)
678    return Type::getDoubleTy(Context);
679
680  llvm_unreachable("Invalid type for a partial vector access of an alloca!");
681}
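// For example (a sketch): a <2 x float> access of a <4 x float> alloca is
// handled by scaling the float element up to a 64-bit double, giving a
// <2 x double> view of the vector from which element 0 is extracted and then
// bitcast back to <2 x float>.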
682
683/// CreateShuffleVectorCast - Creates a shuffle vector to convert one vector
684/// to another vector of the same element type which has the same allocation
685/// size but different primitive sizes (e.g. <3 x i32> and <4 x i32>).
686static Value *CreateShuffleVectorCast(Value *FromVal, const Type *ToType,
687                                      IRBuilder<> &Builder) {
688  const Type *FromType = FromVal->getType();
689  const VectorType *FromVTy = cast<VectorType>(FromType);
690  const VectorType *ToVTy = cast<VectorType>(ToType);
691  assert((ToVTy->getElementType() == FromVTy->getElementType()) &&
692         "Vectors must have the same element type");
693   Value *UnV = UndefValue::get(FromType);
694   unsigned numEltsFrom = FromVTy->getNumElements();
695   unsigned numEltsTo = ToVTy->getNumElements();
696
697   SmallVector<Constant*, 3> Args;
698   const Type* Int32Ty = Builder.getInt32Ty();
699   unsigned minNumElts = std::min(numEltsFrom, numEltsTo);
700   unsigned i;
701   for (i=0; i != minNumElts; ++i)
702     Args.push_back(ConstantInt::get(Int32Ty, i));
703
704   if (i < numEltsTo) {
705     Constant* UnC = UndefValue::get(Int32Ty);
706     for (; i != numEltsTo; ++i)
707       Args.push_back(UnC);
708   }
709   Constant *Mask = ConstantVector::get(Args);
710   return Builder.CreateShuffleVector(FromVal, UnV, Mask, "tmpV");
711}
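// Illustrative output (not from the original source): widening <3 x i32> to
// <4 x i32> emits
//   shufflevector <3 x i32> %v, <3 x i32> undef,
//                 <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
// while narrowing simply uses a mask with fewer elements.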
712
713/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
714/// or vector value FromVal, extracting the bits from the offset specified by
715/// Offset.  This returns the value, which is of type ToType.
716///
717/// This happens when we are converting an "integer union" to a single
718/// integer scalar, or when we are converting a "vector union" to a vector with
719/// insert/extractelement instructions.
720///
721/// Offset is an offset from the original alloca, in bits that need to be
722/// shifted to the right.
723Value *ConvertToScalarInfo::
724ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
725                           uint64_t Offset, IRBuilder<> &Builder) {
726  // If the load is of the whole new alloca, no conversion is needed.
727  const Type *FromType = FromVal->getType();
728  if (FromType == ToType && Offset == 0)
729    return FromVal;
730
731  // If the result alloca is a vector type, this is either an element
732  // access or a bitcast to another vector type of the same size.
733  if (const VectorType *VTy = dyn_cast<VectorType>(FromType)) {
734    unsigned ToTypeSize = TD.getTypeAllocSize(ToType);
735    if (ToTypeSize == AllocaSize) {
736      // If the two types have the same primitive size, use a bit cast.
737      // Otherwise, they are two vectors with the same element type that have
738      // the same allocation size but different numbers of elements, so use
739      // a shuffle vector.
740      if (FromType->getPrimitiveSizeInBits() ==
741          ToType->getPrimitiveSizeInBits())
742        return Builder.CreateBitCast(FromVal, ToType, "tmp");
743      else
744        return CreateShuffleVectorCast(FromVal, ToType, Builder);
745    }
746
747    if (ToType->isVectorTy()) {
748      assert(isPowerOf2_64(AllocaSize / ToTypeSize) &&
749             "Partial vector access of an alloca must have a power-of-2 size "
750             "ratio.");
751      assert(Offset == 0 && "Can't extract a value of a smaller vector type "
752                            "from a nonzero offset.");
753
754      const Type *ToElementTy = cast<VectorType>(ToType)->getElementType();
755      const Type *CastElementTy = getScaledElementType(ToElementTy,
756                                                       ToTypeSize * 8);
757      unsigned NumCastVectorElements = AllocaSize / ToTypeSize;
758
759      LLVMContext &Context = FromVal->getContext();
760      const Type *CastTy = VectorType::get(CastElementTy,
761                                           NumCastVectorElements);
762      Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp");
763      Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get(
764                                        Type::getInt32Ty(Context), 0), "tmp");
765      return Builder.CreateBitCast(Extract, ToType, "tmp");
766    }
767
768    // Otherwise it must be an element access.
769    unsigned Elt = 0;
770    if (Offset) {
771      unsigned EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
772      Elt = Offset/EltSize;
773      assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
774    }
775    // Return the element extracted out of it.
776    Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get(
777                    Type::getInt32Ty(FromVal->getContext()), Elt), "tmp");
778    if (V->getType() != ToType)
779      V = Builder.CreateBitCast(V, ToType, "tmp");
780    return V;
781  }
782
783  // If ToType is a first class aggregate, extract out each of the pieces and
784  // use insertvalue's to form the FCA.
785  if (const StructType *ST = dyn_cast<StructType>(ToType)) {
786    const StructLayout &Layout = *TD.getStructLayout(ST);
787    Value *Res = UndefValue::get(ST);
788    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
789      Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
790                                        Offset+Layout.getElementOffsetInBits(i),
791                                              Builder);
792      Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
793    }
794    return Res;
795  }
796
797  if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
798    uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
799    Value *Res = UndefValue::get(AT);
800    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
801      Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
802                                              Offset+i*EltSize, Builder);
803      Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
804    }
805    return Res;
806  }
807
808  // Otherwise, this must be a union that was converted to an integer value.
809  const IntegerType *NTy = cast<IntegerType>(FromVal->getType());
810
811  // If this is a big-endian system and the load is narrower than the
812  // full alloca type, we need to do a shift to get the right bits.
813  int ShAmt = 0;
814  if (TD.isBigEndian()) {
815    // On big-endian machines, the lowest bit is stored at the bit offset
816    // from the pointer given by getTypeStoreSizeInBits.  This matters for
817    // integers with a bitwidth that is not a multiple of 8.
818    ShAmt = TD.getTypeStoreSizeInBits(NTy) -
819            TD.getTypeStoreSizeInBits(ToType) - Offset;
820  } else {
821    ShAmt = Offset;
822  }
823
824  // Note: we support negative shift amounts (handled with shl), which are
825  // otherwise not defined.  We do this to support (f.e.) loads off the end of
826  // a structure where only some bits are used.
827  if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
828    FromVal = Builder.CreateLShr(FromVal,
829                                 ConstantInt::get(FromVal->getType(),
830                                                           ShAmt), "tmp");
831  else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
832    FromVal = Builder.CreateShl(FromVal,
833                                ConstantInt::get(FromVal->getType(),
834                                                          -ShAmt), "tmp");
835
836  // Finally, unconditionally truncate the integer to the right width.
837  unsigned LIBitWidth = TD.getTypeSizeInBits(ToType);
838  if (LIBitWidth < NTy->getBitWidth())
839    FromVal =
840      Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(),
841                                                    LIBitWidth), "tmp");
842  else if (LIBitWidth > NTy->getBitWidth())
843    FromVal =
844       Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(),
845                                                    LIBitWidth), "tmp");
846
847  // If the result is an integer, this is a trunc or bitcast.
848  if (ToType->isIntegerTy()) {
849    // Should be done.
850  } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) {
851    // Just do a bitcast, we know the sizes match up.
852    FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp");
853  } else {
854    // Otherwise must be a pointer.
855    FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp");
856  }
857  assert(FromVal->getType() == ToType && "Didn't convert right?");
858  return FromVal;
859}
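// Small worked example of the integer path (assuming a little-endian target):
// extracting an i16 at bit offset 16 from an i32-backed alloca produces
//   %s = lshr i32 %val, 16
//   %t = trunc i32 %s to i16
// on a big-endian target the shift amount is instead derived from the store
// sizes, as computed above.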
860
861/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer
862/// or vector value "Old" at the offset specified by Offset.
863///
864/// This happens when we are converting an "integer union" to a
865/// single integer scalar, or when we are converting a "vector union" to a
866/// vector with insert/extractelement instructions.
867///
868/// Offset is an offset from the original alloca, in bits that need to be
869/// shifted to the right.
870Value *ConvertToScalarInfo::
871ConvertScalar_InsertValue(Value *SV, Value *Old,
872                          uint64_t Offset, IRBuilder<> &Builder) {
873  // Convert the stored type to the actual type, shift it left to insert
874  // then 'or' into place.
875  const Type *AllocaType = Old->getType();
876  LLVMContext &Context = Old->getContext();
877
878  if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
879    uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy);
880    uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType());
881
882    // Changing the whole vector with memset or with an access of a different
883    // vector type?
884    if (ValSize == VecSize) {
885      // If the two types have the same primitive size, use a bit cast.
886      // Otherwise, they are two vectors with the same element type that have
887      // the same allocation size but different numbers of elements, so use
888      // a shuffle vector.
889      if (VTy->getPrimitiveSizeInBits() ==
890          SV->getType()->getPrimitiveSizeInBits())
891        return Builder.CreateBitCast(SV, AllocaType, "tmp");
892      else
893        return CreateShuffleVectorCast(SV, VTy, Builder);
894    }
895
896    if (SV->getType()->isVectorTy() && isPowerOf2_64(VecSize / ValSize)) {
897      assert(Offset == 0 && "Can't insert a value of a smaller vector type at "
898                            "a nonzero offset.");
899
900      const Type *ToElementTy =
901        cast<VectorType>(SV->getType())->getElementType();
902      const Type *CastElementTy = getScaledElementType(ToElementTy, ValSize);
903      unsigned NumCastVectorElements = VecSize / ValSize;
904
905      LLVMContext &Context = SV->getContext();
906      const Type *OldCastTy = VectorType::get(CastElementTy,
907                                              NumCastVectorElements);
908      Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp");
909
910      Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp");
911      Value *Insert =
912        Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get(
913                                    Type::getInt32Ty(Context), 0), "tmp");
914      return Builder.CreateBitCast(Insert, AllocaType, "tmp");
915    }
916
917    // Must be an element insertion.
918    assert(SV->getType() == VTy->getElementType());
919    uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
920    unsigned Elt = Offset/EltSize;
921    return Builder.CreateInsertElement(Old, SV,
922                     ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt),
923                                     "tmp");
924  }
925
926  // If SV is a first-class aggregate value, insert each value recursively.
927  if (const StructType *ST = dyn_cast<StructType>(SV->getType())) {
928    const StructLayout &Layout = *TD.getStructLayout(ST);
929    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
930      Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
931      Old = ConvertScalar_InsertValue(Elt, Old,
932                                      Offset+Layout.getElementOffsetInBits(i),
933                                      Builder);
934    }
935    return Old;
936  }
937
938  if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
939    uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
940    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
941      Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
942      Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder);
943    }
944    return Old;
945  }
946
947  // If SV is a float, convert it to the appropriate integer type.
948  // If it is a pointer, do the same.
949  unsigned SrcWidth = TD.getTypeSizeInBits(SV->getType());
950  unsigned DestWidth = TD.getTypeSizeInBits(AllocaType);
951  unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType());
952  unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType);
953  if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
954    SV = Builder.CreateBitCast(SV,
955                            IntegerType::get(SV->getContext(),SrcWidth), "tmp");
956  else if (SV->getType()->isPointerTy())
957    SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()), "tmp");
958
959  // Zero extend or truncate the value if needed.
960  if (SV->getType() != AllocaType) {
961    if (SV->getType()->getPrimitiveSizeInBits() <
962             AllocaType->getPrimitiveSizeInBits())
963      SV = Builder.CreateZExt(SV, AllocaType, "tmp");
964    else {
965      // Truncation may be needed if storing more than the alloca can hold
966      // (undefined behavior).
967      SV = Builder.CreateTrunc(SV, AllocaType, "tmp");
968      SrcWidth = DestWidth;
969      SrcStoreWidth = DestStoreWidth;
970    }
971  }
972
973  // If this is a big-endian system and the store is narrower than the
974  // full alloca type, we need to do a shift to get the right bits.
975  int ShAmt = 0;
976  if (TD.isBigEndian()) {
977    // On big-endian machines, the lowest bit is stored at the bit offset
978    // from the pointer given by getTypeStoreSizeInBits.  This matters for
979    // integers with a bitwidth that is not a multiple of 8.
980    ShAmt = DestStoreWidth - SrcStoreWidth - Offset;
981  } else {
982    ShAmt = Offset;
983  }
984
985  // Note: we support negative shift amounts (handled with shr), which are
986  // otherwise not defined.  We do this to support (f.e.) stores off the end
987  // of a structure where only some bits in the structure are set.
988  APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
989  if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
990    SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(),
991                           ShAmt), "tmp");
992    Mask <<= ShAmt;
993  } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
994    SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(),
995                            -ShAmt), "tmp");
996    Mask = Mask.lshr(-ShAmt);
997  }
998
999  // Mask out the bits we are about to insert from the old value, and or
1000  // in the new bits.
1001  if (SrcWidth != DestWidth) {
1002    assert(DestWidth > SrcWidth);
1003    Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask");
1004    SV = Builder.CreateOr(Old, SV, "ins");
1005  }
1006  return SV;
1007}
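// Companion sketch for the insert path (again little-endian, made-up names):
// storing an i8 at bit offset 8 into an i32-backed alloca emits roughly
//   %z   = zext i8 %sv to i32
//   %s   = shl i32 %z, 8
//   %m   = and i32 %old, -65281          ; ~(0xff << 8)
//   %ins = or i32 %m, %s
// which masks out the old byte and ors in the shifted new one.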
1008
1009
1010//===----------------------------------------------------------------------===//
1011// SRoA Driver
1012//===----------------------------------------------------------------------===//
1013
1014
1015bool SROA::runOnFunction(Function &F) {
1016  TD = getAnalysisIfAvailable<TargetData>();
1017
1018  bool Changed = performPromotion(F);
1019
1020  // FIXME: ScalarRepl currently depends on TargetData more than it
1021  // theoretically needs to. It should be refactored in order to support
1022  // target-independent IR. Until this is done, just skip the actual
1023  // scalar-replacement portion of this pass.
1024  if (!TD) return Changed;
1025
1026  while (1) {
1027    bool LocalChange = performScalarRepl(F);
1028    if (!LocalChange) break;   // No need to repromote if no scalarrepl
1029    Changed = true;
1030    LocalChange = performPromotion(F);
1031    if (!LocalChange) break;   // No need to re-scalarrepl if no promotion
1032  }
1033
1034  return Changed;
1035}
1036
1037namespace {
1038class AllocaPromoter : public LoadAndStorePromoter {
1039  AllocaInst *AI;
1040public:
1041  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S)
1042    : LoadAndStorePromoter(Insts, S), AI(0) {}
1043
1044  void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
1045    // Remember which alloca we're promoting (for isInstInList).
1046    this->AI = AI;
1047    LoadAndStorePromoter::run(Insts);
1048    AI->eraseFromParent();
1049  }
1050
1051  virtual bool isInstInList(Instruction *I,
1052                            const SmallVectorImpl<Instruction*> &Insts) const {
1053    if (LoadInst *LI = dyn_cast<LoadInst>(I))
1054      return LI->getOperand(0) == AI;
1055    return cast<StoreInst>(I)->getPointerOperand() == AI;
1056  }
1057};
1058} // end anon namespace
1059
1060/// isSafeSelectToSpeculate - Select instructions that use an alloca and are
1061/// subsequently loaded can be rewritten to load both input pointers and then
1062/// select between the results, allowing the load of the alloca to be promoted.
1063/// From this:
1064///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
1065///   %V = load i32* %P2
1066/// to:
1067///   %V1 = load i32* %Alloca      -> will be mem2reg'd
1068///   %V2 = load i32* %Other
1069///   %V = select i1 %cond, i32 %V1, i32 %V2
1070///
1071/// We can do this to a select if its only uses are loads and if the operands
1072/// to the select can be loaded unconditionally.
1073static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) {
1074  bool TDerefable = SI->getTrueValue()->isDereferenceablePointer();
1075  bool FDerefable = SI->getFalseValue()->isDereferenceablePointer();
1076
1077  for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end();
1078       UI != UE; ++UI) {
1079    LoadInst *LI = dyn_cast<LoadInst>(*UI);
1080    if (LI == 0 || LI->isVolatile()) return false;
1081
1082    // Both operands to the select need to be dereferenceable, either absolutely
1083    // (e.g. allocas) or at this point because we can see other accesses to it.
1084    if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
1085                                                    LI->getAlignment(), TD))
1086      return false;
1087    if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
1088                                                    LI->getAlignment(), TD))
1089      return false;
1090  }
1091
1092  return true;
1093}
1094
1095/// isSafePHIToSpeculate - PHI instructions that use an alloca and are
1096/// subsequently loaded can be rewritten to load both input pointers in the pred
1097/// blocks and then PHI the results, allowing the load of the alloca to be
1098/// promoted.
1099/// From this:
1100///   %P2 = phi [i32* %Alloca, i32* %Other]
1101///   %V = load i32* %P2
1102/// to:
1103///   %V1 = load i32* %Alloca      -> will be mem2reg'd
1104///   ...
1105///   %V2 = load i32* %Other
1106///   ...
1107///   %V = phi [i32 %V1, i32 %V2]
1108///
1109/// We can do this to a PHI if its only uses are loads and if the operands
1110/// to the PHI can be loaded unconditionally.
1111static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
1112  // For now, we can only do this promotion if the load is in the same block as
1113  // the PHI, and if there are no stores between the phi and load.
1114  // TODO: Allow recursive phi users.
1115  // TODO: Allow stores.
1116  BasicBlock *BB = PN->getParent();
1117  unsigned MaxAlign = 0;
1118  for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end();
1119       UI != UE; ++UI) {
1120    LoadInst *LI = dyn_cast<LoadInst>(*UI);
1121    if (LI == 0 || LI->isVolatile()) return false;
1122
1123    // For now we only allow loads in the same block as the PHI.  This is a
1124    // common case that happens when instcombine merges two loads through a PHI.
1125    if (LI->getParent() != BB) return false;
1126
1127    // Ensure that there are no instructions between the PHI and the load that
1128    // could store.
1129    for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI)
1130      if (BBI->mayWriteToMemory())
1131        return false;
1132
1133    MaxAlign = std::max(MaxAlign, LI->getAlignment());
1134  }
1135
1136  // Okay, we know that we have one or more loads in the same block as the PHI.
1137  // We can transform this if it is safe to push the loads into the predecessor
1138  // blocks.  The only thing to watch out for is that we can't put a possibly
1139  // trapping load in the predecessor if it is a critical edge.
1140  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
1141    BasicBlock *Pred = PN->getIncomingBlock(i);
1142
1143    // If the predecessor has a single successor, then the edge isn't critical.
1144    if (Pred->getTerminator()->getNumSuccessors() == 1)
1145      continue;
1146
1147    Value *InVal = PN->getIncomingValue(i);
1148
1149    // If the InVal is an invoke in the pred, we can't put a load on the edge.
1150    if (InvokeInst *II = dyn_cast<InvokeInst>(InVal))
1151      if (II->getParent() == Pred)
1152        return false;
1153
1154    // If this pointer is always safe to load, or if we can prove that there is
1155    // already a load in the block, then we can move the load to the pred block.
1156    if (InVal->isDereferenceablePointer() ||
1157        isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD))
1158      continue;
1159
1160    return false;
1161  }
1162
1163  return true;
1164}
1165
1166
1167/// tryToMakeAllocaBePromotable - This returns true if the alloca only has
1168/// direct (non-volatile) loads and stores to it.  If the alloca is close but
1169/// not quite there, this will transform the code to allow promotion.  As such,
1170/// it is a non-pure predicate.
1171static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
1172  SetVector<Instruction*, SmallVector<Instruction*, 4>,
1173            SmallPtrSet<Instruction*, 4> > InstsToRewrite;
1174
1175  for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
1176       UI != UE; ++UI) {
1177    User *U = *UI;
1178    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
1179      if (LI->isVolatile())
1180        return false;
1181      continue;
1182    }
1183
1184    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
1185      if (SI->getOperand(0) == AI || SI->isVolatile())
1186        return false;   // Don't allow a store OF the AI, only INTO the AI.
1187      continue;
1188    }
1189
1190    if (SelectInst *SI = dyn_cast<SelectInst>(U)) {
1191      // If the condition being selected on is a constant, fold the select; yes,
1192      // this does (rarely) happen early on.
1193      if (ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition())) {
1194        Value *Result = SI->getOperand(1+CI->isZero());
1195        SI->replaceAllUsesWith(Result);
1196        SI->eraseFromParent();
1197
1198        // This is very rare, and we just scrambled the use list of AI; start
1199        // over completely.
1200        return tryToMakeAllocaBePromotable(AI, TD);
1201      }
1202
1203      // If it is safe to turn "load (select c, AI, ptr)" into a select of two
1204      // loads, then we can transform this by rewriting the select.
1205      if (!isSafeSelectToSpeculate(SI, TD))
1206        return false;
1207
1208      InstsToRewrite.insert(SI);
1209      continue;
1210    }
1211
1212    if (PHINode *PN = dyn_cast<PHINode>(U)) {
1213      if (PN->use_empty()) {  // Dead PHIs can be stripped.
1214        InstsToRewrite.insert(PN);
1215        continue;
1216      }
1217
1218      // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads
1219      // in the pred blocks, then we can transform this by rewriting the PHI.
1220      if (!isSafePHIToSpeculate(PN, TD))
1221        return false;
1222
1223      InstsToRewrite.insert(PN);
1224      continue;
1225    }
1226
1227    return false;
1228  }
1229
1230  // If there are no instructions to rewrite, then all uses are load/stores and
1231  // we're done!
1232  if (InstsToRewrite.empty())
1233    return true;
1234
1235  // If we have instructions that need to be rewritten for this to be
1236  // promotable, take care of it now.
1237  for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) {
1238    if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) {
1239      // Selects in InstsToRewrite only have load uses.  Rewrite each as two
1240      // loads with a new select.
1241      while (!SI->use_empty()) {
1242        LoadInst *LI = cast<LoadInst>(SI->use_back());
1243
1244        IRBuilder<> Builder(LI);
1245        LoadInst *TrueLoad =
1246          Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t");
1247        LoadInst *FalseLoad =
1248          Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f");
1249
1250        // Transfer alignment and TBAA info if present.
1251        TrueLoad->setAlignment(LI->getAlignment());
1252        FalseLoad->setAlignment(LI->getAlignment());
1253        if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
1254          TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
1255          FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
1256        }
1257
1258        Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad);
1259        V->takeName(LI);
1260        LI->replaceAllUsesWith(V);
1261        LI->eraseFromParent();
1262      }
1263
1264      // Now that all the loads are gone, the select is gone too.
1265      SI->eraseFromParent();
1266      continue;
1267    }
1268
1269    // Otherwise, we have a PHI node which allows us to push the loads into the
1270    // predecessors.
1271    PHINode *PN = cast<PHINode>(InstsToRewrite[i]);
1272    if (PN->use_empty()) {
1273      PN->eraseFromParent();
1274      continue;
1275    }
1276
1277    const Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
1278    PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(),
1279                                     PN->getName()+".ld", PN);
1280
1281    // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
1282    // matter which one we pick, even if the loads happen to differ.
1283    LoadInst *SomeLoad = cast<LoadInst>(PN->use_back());
1284    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
1285    unsigned Align = SomeLoad->getAlignment();
1286
1287    // Rewrite all loads of the PN to use the new PHI.
1288    while (!PN->use_empty()) {
1289      LoadInst *LI = cast<LoadInst>(PN->use_back());
1290      LI->replaceAllUsesWith(NewPN);
1291      LI->eraseFromParent();
1292    }
1293
1294    // Inject loads into all of the pred blocks.  Keep track of which blocks we
1295    // insert them into in case we have multiple edges from the same block.
1296    DenseMap<BasicBlock*, LoadInst*> InsertedLoads;
1297
1298    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
1299      BasicBlock *Pred = PN->getIncomingBlock(i);
1300      LoadInst *&Load = InsertedLoads[Pred];
1301      if (Load == 0) {
1302        Load = new LoadInst(PN->getIncomingValue(i),
1303                            PN->getName() + "." + Pred->getName(),
1304                            Pred->getTerminator());
1305        Load->setAlignment(Align);
1306        if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
1307      }
1308
1309      NewPN->addIncoming(Load, Pred);
1310    }
1311
1312    PN->eraseFromParent();
1313  }
1314
1315  ++NumAdjusted;
1316  return true;
1317}
1318
1319
1320bool SROA::performPromotion(Function &F) {
1321  std::vector<AllocaInst*> Allocas;
1322  DominatorTree *DT = 0;
1323  if (HasDomTree)
1324    DT = &getAnalysis<DominatorTree>();
1325
1326  BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
1327
1328  bool Changed = false;
1329  SmallVector<Instruction*, 64> Insts;
1330  while (1) {
1331    Allocas.clear();
1332
1333    // Find allocas that are safe to promote, by looking at all instructions in
1334    // the entry node
1335    for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
1336      if (AllocaInst *AI = dyn_cast<AllocaInst>(I))       // Is it an alloca?
1337        if (tryToMakeAllocaBePromotable(AI, TD))
1338          Allocas.push_back(AI);
1339
1340    if (Allocas.empty()) break;
1341
1342    if (HasDomTree)
1343      PromoteMemToReg(Allocas, *DT);
1344    else {
1345      SSAUpdater SSA;
1346      for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
1347        AllocaInst *AI = Allocas[i];
1348
1349        // Build list of instructions to promote.
1350        for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
1351             UI != E; ++UI)
1352          Insts.push_back(cast<Instruction>(*UI));
1353
1354        AllocaPromoter(Insts, SSA).run(AI, Insts);
1355        Insts.clear();
1356      }
1357    }
1358    NumPromoted += Allocas.size();
1359    Changed = true;
1360  }
1361
1362  return Changed;
1363}
1364
1365
1366/// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for
1367/// SROA.  It must be a struct or array type with a small number of elements.
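/// For example (illustrative types only), { i32, float, i8* } has three
/// elements and qualifies, while [100 x i32] exceeds the 8-element array
/// limit checked below.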
1368static bool ShouldAttemptScalarRepl(AllocaInst *AI) {
1369  const Type *T = AI->getAllocatedType();
1370  // Do not promote any struct into more than 32 separate vars.
1371  if (const StructType *ST = dyn_cast<StructType>(T))
1372    return ST->getNumElements() <= 32;
1373  // Arrays are much less likely to be safe for SROA; only consider
1374  // them if they are very small.
1375  if (const ArrayType *AT = dyn_cast<ArrayType>(T))
1376    return AT->getNumElements() <= 8;
1377  return false;
1378}
1379
1380
// performScalarRepl - This is a simple worklist-driven algorithm that visits
// all of the alloca instructions in the function's entry block and, where it
// is safe to do so, breaks them up into per-element allocas or converts them
// into a single scalar value.
1384//
1385bool SROA::performScalarRepl(Function &F) {
1386  std::vector<AllocaInst*> WorkList;
1387
1388  // Scan the entry basic block, adding allocas to the worklist.
1389  BasicBlock &BB = F.getEntryBlock();
1390  for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I)
1391    if (AllocaInst *A = dyn_cast<AllocaInst>(I))
1392      WorkList.push_back(A);
1393
1394  // Process the worklist
1395  bool Changed = false;
1396  while (!WorkList.empty()) {
1397    AllocaInst *AI = WorkList.back();
1398    WorkList.pop_back();
1399
1400    // Handle dead allocas trivially.  These can be formed by SROA'ing arrays
1401    // with unused elements.
1402    if (AI->use_empty()) {
1403      AI->eraseFromParent();
1404      Changed = true;
1405      continue;
1406    }
1407
1408    // If this alloca is impossible for us to promote, reject it early.
1409    if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized())
1410      continue;
1411
1412    // Check to see if this allocation is only modified by a memcpy/memmove from
1413    // a constant global.  If this is the case, we can change all users to use
1414    // the constant global instead.  This is commonly produced by the CFE by
1415    // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
1416    // is only subsequently read.
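    // For illustration only (names are hypothetical), the pattern is roughly:
    //   %A = alloca [9 x i32]
    //   ...memcpy of a constant global @cst into %A...
    // where every other use of %A is a read, so uses of %A can be rewritten
    // to refer to @cst directly.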
1417    if (MemTransferInst *TheCopy = isOnlyCopiedFromConstantGlobal(AI)) {
1418      DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n');
1419      DEBUG(dbgs() << "  memcpy = " << *TheCopy << '\n');
1420      Constant *TheSrc = cast<Constant>(TheCopy->getSource());
1421      AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType()));
1422      TheCopy->eraseFromParent();  // Don't mutate the global.
1423      AI->eraseFromParent();
1424      ++NumGlobals;
1425      Changed = true;
1426      continue;
1427    }
1428
1429    // Check to see if we can perform the core SROA transformation.  We cannot
1430    // transform the allocation instruction if it is an array allocation
1431    // (allocations OF arrays are ok though), and an allocation of a scalar
1432    // value cannot be decomposed at all.
1433    uint64_t AllocaSize = TD->getTypeAllocSize(AI->getAllocatedType());
1434
1435    // Do not promote [0 x %struct].
1436    if (AllocaSize == 0) continue;
1437
1438    // Do not promote any struct whose size is too big.
1439    if (AllocaSize > SRThreshold) continue;
1440
1441    // If the alloca looks like a good candidate for scalar replacement, and if
1442    // all its users can be transformed, then split up the aggregate into its
1443    // separate elements.
1444    if (ShouldAttemptScalarRepl(AI) && isSafeAllocaToScalarRepl(AI)) {
1445      DoScalarReplacement(AI, WorkList);
1446      Changed = true;
1447      continue;
1448    }
1449
    // See if we can turn this aggregate value (potentially with casts) into a
    // simple scalar value that can be mem2reg'd into a register.  IsNotTrivial
    // tracks whether this is something that mem2reg could have promoted
    // itself.  If so, we don't want to transform it needlessly.  Note that we
    // can't just check based on the type: the alloca may be of an i32 but have
    // pointer arithmetic that sets byte 3 of it or something.
1456    if (AllocaInst *NewAI =
1457          ConvertToScalarInfo((unsigned)AllocaSize, *TD).TryConvert(AI)) {
1458      NewAI->takeName(AI);
1459      AI->eraseFromParent();
1460      ++NumConverted;
1461      Changed = true;
1462      continue;
1463    }
1464
1465    // Otherwise, couldn't process this alloca.
1466  }
1467
1468  return Changed;
1469}
1470
/// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl
/// predicate; perform SROA on it now.
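/// For example (illustrative), "%X = alloca { i32, float }" becomes
/// "%X.0 = alloca i32" and "%X.1 = alloca float", and the uses of %X are then
/// rewritten in terms of the element allocas.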
1473void SROA::DoScalarReplacement(AllocaInst *AI,
1474                               std::vector<AllocaInst*> &WorkList) {
1475  DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n');
1476  SmallVector<AllocaInst*, 32> ElementAllocas;
1477  if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
1478    ElementAllocas.reserve(ST->getNumContainedTypes());
1479    for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) {
1480      AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0,
1481                                      AI->getAlignment(),
1482                                      AI->getName() + "." + Twine(i), AI);
1483      ElementAllocas.push_back(NA);
1484      WorkList.push_back(NA);  // Add to worklist for recursive processing
1485    }
1486  } else {
1487    const ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
1488    ElementAllocas.reserve(AT->getNumElements());
1489    const Type *ElTy = AT->getElementType();
1490    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
1491      AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(),
1492                                      AI->getName() + "." + Twine(i), AI);
1493      ElementAllocas.push_back(NA);
1494      WorkList.push_back(NA);  // Add to worklist for recursive processing
1495    }
1496  }
1497
1498  // Now that we have created the new alloca instructions, rewrite all the
1499  // uses of the old alloca.
1500  RewriteForScalarRepl(AI, AI, 0, ElementAllocas);
1501
1502  // Now erase any instructions that were made dead while rewriting the alloca.
1503  DeleteDeadInstructions();
1504  AI->eraseFromParent();
1505
1506  ++NumReplaced;
1507}
1508
1509/// DeleteDeadInstructions - Erase instructions on the DeadInstrs list,
1510/// recursively including all their operands that become trivially dead.
1511void SROA::DeleteDeadInstructions() {
1512  while (!DeadInsts.empty()) {
1513    Instruction *I = cast<Instruction>(DeadInsts.pop_back_val());
1514
1515    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
1516      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
1517        // Zero out the operand and see if it becomes trivially dead.
1518        // (But, don't add allocas to the dead instruction list -- they are
1519        // already on the worklist and will be deleted separately.)
1520        *OI = 0;
1521        if (isInstructionTriviallyDead(U) && !isa<AllocaInst>(U))
1522          DeadInsts.push_back(U);
1523      }
1524
1525    I->eraseFromParent();
1526  }
1527}
1528
1529/// isSafeForScalarRepl - Check if instruction I is a safe use with regard to
1530/// performing scalar replacement of alloca AI.  The results are flagged in
1531/// the Info parameter.  Offset indicates the position within AI that is
1532/// referenced by this instruction.
1533void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
1534                               AllocaInfo &Info) {
1535  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
1536    Instruction *User = cast<Instruction>(*UI);
1537
1538    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
1539      isSafeForScalarRepl(BC, Offset, Info);
1540    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
1541      uint64_t GEPOffset = Offset;
1542      isSafeGEP(GEPI, GEPOffset, Info);
1543      if (!Info.isUnsafe)
1544        isSafeForScalarRepl(GEPI, GEPOffset, Info);
1545    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
1546      ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
1547      if (Length == 0)
1548        return MarkUnsafe(Info, User);
1549      isSafeMemAccess(Offset, Length->getZExtValue(), 0,
1550                      UI.getOperandNo() == 0, Info, MI,
1551                      true /*AllowWholeAccess*/);
1552    } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
1553      if (LI->isVolatile())
1554        return MarkUnsafe(Info, User);
1555      const Type *LIType = LI->getType();
1556      isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
1557                      LIType, false, Info, LI, true /*AllowWholeAccess*/);
1558      Info.hasALoadOrStore = true;
1559
1560    } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
1561      // Store is ok if storing INTO the pointer, not storing the pointer
1562      if (SI->isVolatile() || SI->getOperand(0) == I)
1563        return MarkUnsafe(Info, User);
1564
1565      const Type *SIType = SI->getOperand(0)->getType();
1566      isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
1567                      SIType, true, Info, SI, true /*AllowWholeAccess*/);
1568      Info.hasALoadOrStore = true;
1569    } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
1570      isSafePHISelectUseForScalarRepl(User, Offset, Info);
1571    } else {
1572      return MarkUnsafe(Info, User);
1573    }
1574    if (Info.isUnsafe) return;
1575  }
1576}
1577
1578
/// isSafePHISelectUseForScalarRepl - If we see a PHI node or select using a pointer
1580/// derived from the alloca, we can often still split the alloca into elements.
1581/// This is useful if we have a large alloca where one element is phi'd
1582/// together somewhere: we can SRoA and promote all the other elements even if
1583/// we end up not being able to promote this one.
1584///
1585/// All we require is that the uses of the PHI do not index into other parts of
/// the alloca.  The most important use case for this is single loads and stores
1587/// that are PHI'd together, which can happen due to code sinking.
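/// For example (illustrative), if two GEPs to element 1 of the alloca are
/// merged by a "phi i32*" whose only uses are simple loads and stores, the
/// remaining elements can still be split apart and promoted.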
1588void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
1589                                           AllocaInfo &Info) {
1590  // If we've already checked this PHI, don't do it again.
1591  if (PHINode *PN = dyn_cast<PHINode>(I))
1592    if (!Info.CheckedPHIs.insert(PN))
1593      return;
1594
1595  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
1596    Instruction *User = cast<Instruction>(*UI);
1597
1598    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
1599      isSafePHISelectUseForScalarRepl(BC, Offset, Info);
1600    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
1601      // Only allow "bitcast" GEPs for simplicity.  We could generalize this,
1602      // but would have to prove that we're staying inside of an element being
1603      // promoted.
1604      if (!GEPI->hasAllZeroIndices())
1605        return MarkUnsafe(Info, User);
1606      isSafePHISelectUseForScalarRepl(GEPI, Offset, Info);
1607    } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
1608      if (LI->isVolatile())
1609        return MarkUnsafe(Info, User);
1610      const Type *LIType = LI->getType();
1611      isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
1612                      LIType, false, Info, LI, false /*AllowWholeAccess*/);
1613      Info.hasALoadOrStore = true;
1614
1615    } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
1616      // Store is ok if storing INTO the pointer, not storing the pointer
1617      if (SI->isVolatile() || SI->getOperand(0) == I)
1618        return MarkUnsafe(Info, User);
1619
1620      const Type *SIType = SI->getOperand(0)->getType();
1621      isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
1622                      SIType, true, Info, SI, false /*AllowWholeAccess*/);
1623      Info.hasALoadOrStore = true;
1624    } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
1625      isSafePHISelectUseForScalarRepl(User, Offset, Info);
1626    } else {
1627      return MarkUnsafe(Info, User);
1628    }
1629    if (Info.isUnsafe) return;
1630  }
1631}
1632
1633/// isSafeGEP - Check if a GEP instruction can be handled for scalar
1634/// replacement.  It is safe when all the indices are constant, in-bounds
1635/// references, and when the resulting offset corresponds to an element within
1636/// the alloca type.  The results are flagged in the Info parameter.  Upon
1637/// return, Offset is adjusted as specified by the GEP indices.
1638void SROA::isSafeGEP(GetElementPtrInst *GEPI,
1639                     uint64_t &Offset, AllocaInfo &Info) {
1640  gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI);
1641  if (GEPIt == E)
1642    return;
1643
1644  // Walk through the GEP type indices, checking the types that this indexes
1645  // into.
1646  for (; GEPIt != E; ++GEPIt) {
1647    // Ignore struct elements, no extra checking needed for these.
1648    if ((*GEPIt)->isStructTy())
1649      continue;
1650
1651    ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand());
1652    if (!IdxVal)
1653      return MarkUnsafe(Info, GEPI);
1654  }
1655
1656  // Compute the offset due to this GEP and check if the alloca has a
1657  // component element at that offset.
1658  SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
1659  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(),
1660                                 &Indices[0], Indices.size());
1661  if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0))
1662    MarkUnsafe(Info, GEPI);
1663}
1664
1665/// isHomogeneousAggregate - Check if type T is a struct or array containing
1666/// elements of the same type (which is always true for arrays).  If so,
1667/// return true with NumElts and EltTy set to the number of elements and the
1668/// element type, respectively.
1669static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts,
1670                                   const Type *&EltTy) {
1671  if (const ArrayType *AT = dyn_cast<ArrayType>(T)) {
1672    NumElts = AT->getNumElements();
1673    EltTy = (NumElts == 0 ? 0 : AT->getElementType());
1674    return true;
1675  }
1676  if (const StructType *ST = dyn_cast<StructType>(T)) {
1677    NumElts = ST->getNumContainedTypes();
1678    EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0));
1679    for (unsigned n = 1; n < NumElts; ++n) {
1680      if (ST->getContainedType(n) != EltTy)
1681        return false;
1682    }
1683    return true;
1684  }
1685  return false;
1686}
1687
1688/// isCompatibleAggregate - Check if T1 and T2 are either the same type or are
1689/// "homogeneous" aggregates with the same element type and number of elements.
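/// For example (illustrative), { i32, i32 } and [2 x i32] are compatible
/// (two i32 elements each), while { i32, float } is not homogeneous and is
/// only compatible with itself.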
1690static bool isCompatibleAggregate(const Type *T1, const Type *T2) {
1691  if (T1 == T2)
1692    return true;
1693
1694  unsigned NumElts1, NumElts2;
1695  const Type *EltTy1, *EltTy2;
1696  if (isHomogeneousAggregate(T1, NumElts1, EltTy1) &&
1697      isHomogeneousAggregate(T2, NumElts2, EltTy2) &&
1698      NumElts1 == NumElts2 &&
1699      EltTy1 == EltTy2)
1700    return true;
1701
1702  return false;
1703}
1704
1705/// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI
1706/// alloca or has an offset and size that corresponds to a component element
1707/// within it.  The offset checked here may have been formed from a GEP with a
1708/// pointer bitcasted to a different type.
1709///
1710/// If AllowWholeAccess is true, then this allows uses of the entire alloca as a
1711/// unit.  If false, it only allows accesses known to be in a single element.
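/// For example (illustrative), with "%A = alloca { i32, i32 }" an 8-byte
/// memcpy is a whole-alloca access and an i32 store at offset 4 is a
/// single-element access, but an i32 store at offset 2 straddles both
/// elements and is flagged unsafe.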
1712void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
1713                           const Type *MemOpType, bool isStore,
1714                           AllocaInfo &Info, Instruction *TheAccess,
1715                           bool AllowWholeAccess) {
1716  // Check if this is a load/store of the entire alloca.
1717  if (Offset == 0 && AllowWholeAccess &&
1718      MemSize == TD->getTypeAllocSize(Info.AI->getAllocatedType())) {
1719    // This can be safe for MemIntrinsics (where MemOpType is 0) and integer
1720    // loads/stores (which are essentially the same as the MemIntrinsics with
1721    // regard to copying padding between elements).  But, if an alloca is
1722    // flagged as both a source and destination of such operations, we'll need
1723    // to check later for padding between elements.
1724    if (!MemOpType || MemOpType->isIntegerTy()) {
1725      if (isStore)
1726        Info.isMemCpyDst = true;
1727      else
1728        Info.isMemCpySrc = true;
1729      return;
1730    }
1731    // This is also safe for references using a type that is compatible with
1732    // the type of the alloca, so that loads/stores can be rewritten using
1733    // insertvalue/extractvalue.
1734    if (isCompatibleAggregate(MemOpType, Info.AI->getAllocatedType())) {
1735      Info.hasSubelementAccess = true;
1736      return;
1737    }
1738  }
1739  // Check if the offset/size correspond to a component within the alloca type.
1740  const Type *T = Info.AI->getAllocatedType();
1741  if (TypeHasComponent(T, Offset, MemSize)) {
1742    Info.hasSubelementAccess = true;
1743    return;
1744  }
1745
1746  return MarkUnsafe(Info, TheAccess);
1747}
1748
1749/// TypeHasComponent - Return true if T has a component type with the
1750/// specified offset and size.  If Size is zero, do not check the size.
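/// For example (illustrative), in { i32, { i16, i16 } } an access at offset 4
/// with size 2 lands exactly on the first nested i16, while an access at
/// offset 2 with size 4 straddles elements and has no matching component.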
1751bool SROA::TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size) {
1752  const Type *EltTy;
1753  uint64_t EltSize;
1754  if (const StructType *ST = dyn_cast<StructType>(T)) {
1755    const StructLayout *Layout = TD->getStructLayout(ST);
1756    unsigned EltIdx = Layout->getElementContainingOffset(Offset);
1757    EltTy = ST->getContainedType(EltIdx);
1758    EltSize = TD->getTypeAllocSize(EltTy);
1759    Offset -= Layout->getElementOffset(EltIdx);
1760  } else if (const ArrayType *AT = dyn_cast<ArrayType>(T)) {
1761    EltTy = AT->getElementType();
1762    EltSize = TD->getTypeAllocSize(EltTy);
1763    if (Offset >= AT->getNumElements() * EltSize)
1764      return false;
1765    Offset %= EltSize;
1766  } else {
1767    return false;
1768  }
1769  if (Offset == 0 && (Size == 0 || EltSize == Size))
1770    return true;
1771  // Check if the component spans multiple elements.
1772  if (Offset + Size > EltSize)
1773    return false;
1774  return TypeHasComponent(EltTy, Offset, Size);
1775}
1776
1777/// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite
1778/// the instruction I, which references it, to use the separate elements.
1779/// Offset indicates the position within AI that is referenced by this
1780/// instruction.
1781void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
1782                                SmallVector<AllocaInst*, 32> &NewElts) {
1783  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) {
1784    Use &TheUse = UI.getUse();
1785    Instruction *User = cast<Instruction>(*UI++);
1786
1787    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
1788      RewriteBitCast(BC, AI, Offset, NewElts);
1789      continue;
1790    }
1791
1792    if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
1793      RewriteGEP(GEPI, AI, Offset, NewElts);
1794      continue;
1795    }
1796
1797    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
1798      ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
1799      uint64_t MemSize = Length->getZExtValue();
1800      if (Offset == 0 &&
1801          MemSize == TD->getTypeAllocSize(AI->getAllocatedType()))
1802        RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts);
1803      // Otherwise the intrinsic can only touch a single element and the
1804      // address operand will be updated, so nothing else needs to be done.
1805      continue;
1806    }
1807
1808    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
1809      const Type *LIType = LI->getType();
1810
1811      if (isCompatibleAggregate(LIType, AI->getAllocatedType())) {
1812        // Replace:
1813        //   %res = load { i32, i32 }* %alloc
1814        // with:
1815        //   %load.0 = load i32* %alloc.0
        //   %insert.0 = insertvalue { i32, i32 } undef, i32 %load.0, 0
1817        //   %load.1 = load i32* %alloc.1
1818        //   %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1
1819        // (Also works for arrays instead of structs)
1820        Value *Insert = UndefValue::get(LIType);
1821        for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
1822          Value *Load = new LoadInst(NewElts[i], "load", LI);
1823          Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI);
1824        }
1825        LI->replaceAllUsesWith(Insert);
1826        DeadInsts.push_back(LI);
1827      } else if (LIType->isIntegerTy() &&
1828                 TD->getTypeAllocSize(LIType) ==
1829                 TD->getTypeAllocSize(AI->getAllocatedType())) {
1830        // If this is a load of the entire alloca to an integer, rewrite it.
1831        RewriteLoadUserOfWholeAlloca(LI, AI, NewElts);
1832      }
1833      continue;
1834    }
1835
1836    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
1837      Value *Val = SI->getOperand(0);
1838      const Type *SIType = Val->getType();
1839      if (isCompatibleAggregate(SIType, AI->getAllocatedType())) {
1840        // Replace:
1841        //   store { i32, i32 } %val, { i32, i32 }* %alloc
1842        // with:
1843        //   %val.0 = extractvalue { i32, i32 } %val, 0
1844        //   store i32 %val.0, i32* %alloc.0
1845        //   %val.1 = extractvalue { i32, i32 } %val, 1
1846        //   store i32 %val.1, i32* %alloc.1
1847        // (Also works for arrays instead of structs)
1848        for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
1849          Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI);
1850          new StoreInst(Extract, NewElts[i], SI);
1851        }
1852        DeadInsts.push_back(SI);
1853      } else if (SIType->isIntegerTy() &&
1854                 TD->getTypeAllocSize(SIType) ==
1855                 TD->getTypeAllocSize(AI->getAllocatedType())) {
1856        // If this is a store of the entire alloca from an integer, rewrite it.
1857        RewriteStoreUserOfWholeAlloca(SI, AI, NewElts);
1858      }
1859      continue;
1860    }
1861
1862    if (isa<SelectInst>(User) || isa<PHINode>(User)) {
      // If we have a PHI or select user of the alloca itself (as opposed to a
      // GEP or bitcast), we have to rewrite it.  GEP and bitcast uses will be
      // RAUW'd to the new pointer.
1866      if (!isa<AllocaInst>(I)) continue;
1867
1868      assert(Offset == 0 && NewElts[0] &&
1869             "Direct alloca use should have a zero offset");
1870
1871      // If we have a use of the alloca, we know the derived uses will be
1872      // utilizing just the first element of the scalarized result.  Insert a
1873      // bitcast of the first alloca before the user as required.
1874      AllocaInst *NewAI = NewElts[0];
1875      BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI);
1876      NewAI->moveBefore(BCI);
1877      TheUse = BCI;
1878      continue;
1879    }
1880  }
1881}
1882
1883/// RewriteBitCast - Update a bitcast reference to the alloca being replaced
1884/// and recursively continue updating all of its uses.
1885void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
1886                          SmallVector<AllocaInst*, 32> &NewElts) {
1887  RewriteForScalarRepl(BC, AI, Offset, NewElts);
1888  if (BC->getOperand(0) != AI)
1889    return;
1890
1891  // The bitcast references the original alloca.  Replace its uses with
1892  // references to the first new element alloca.
1893  Instruction *Val = NewElts[0];
1894  if (Val->getType() != BC->getDestTy()) {
1895    Val = new BitCastInst(Val, BC->getDestTy(), "", BC);
1896    Val->takeName(BC);
1897  }
1898  BC->replaceAllUsesWith(Val);
1899  DeadInsts.push_back(BC);
1900}
1901
1902/// FindElementAndOffset - Return the index of the element containing Offset
1903/// within the specified type, which must be either a struct or an array.
1904/// Sets T to the type of the element and Offset to the offset within that
1905/// element.  IdxTy is set to the type of the index result to be used in a
1906/// GEP instruction.
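/// For example (illustrative), for { i32, [4 x i16] } with Offset 6 this
/// returns index 1 (the array field) and leaves Offset at 2 within it.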
1907uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset,
1908                                    const Type *&IdxTy) {
1909  uint64_t Idx = 0;
1910  if (const StructType *ST = dyn_cast<StructType>(T)) {
1911    const StructLayout *Layout = TD->getStructLayout(ST);
1912    Idx = Layout->getElementContainingOffset(Offset);
1913    T = ST->getContainedType(Idx);
1914    Offset -= Layout->getElementOffset(Idx);
1915    IdxTy = Type::getInt32Ty(T->getContext());
1916    return Idx;
1917  }
1918  const ArrayType *AT = cast<ArrayType>(T);
1919  T = AT->getElementType();
1920  uint64_t EltSize = TD->getTypeAllocSize(T);
1921  Idx = Offset / EltSize;
1922  Offset -= Idx * EltSize;
1923  IdxTy = Type::getInt64Ty(T->getContext());
1924  return Idx;
1925}
1926
1927/// RewriteGEP - Check if this GEP instruction moves the pointer across
1928/// elements of the alloca that are being split apart, and if so, rewrite
1929/// the GEP to be relative to the new element.
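/// For example (illustrative), "getelementptr { i32, i32 }* %A, i32 0, i32 1"
/// lands in element 1, so it is rewritten to address the second element's new
/// alloca (with a bitcast if the pointee types differ).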
1930void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
1931                      SmallVector<AllocaInst*, 32> &NewElts) {
1932  uint64_t OldOffset = Offset;
1933  SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
1934  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(),
1935                                 &Indices[0], Indices.size());
1936
1937  RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
1938
1939  const Type *T = AI->getAllocatedType();
1940  const Type *IdxTy;
1941  uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy);
1942  if (GEPI->getOperand(0) == AI)
1943    OldIdx = ~0ULL; // Force the GEP to be rewritten.
1944
1945  T = AI->getAllocatedType();
1946  uint64_t EltOffset = Offset;
1947  uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy);
1948
1949  // If this GEP does not move the pointer across elements of the alloca
  // being split, then it does not need to be rewritten.
1951  if (Idx == OldIdx)
1952    return;
1953
1954  const Type *i32Ty = Type::getInt32Ty(AI->getContext());
1955  SmallVector<Value*, 8> NewArgs;
1956  NewArgs.push_back(Constant::getNullValue(i32Ty));
1957  while (EltOffset != 0) {
1958    uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy);
1959    NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx));
1960  }
1961  Instruction *Val = NewElts[Idx];
1962  if (NewArgs.size() > 1) {
1963    Val = GetElementPtrInst::CreateInBounds(Val, NewArgs.begin(),
1964                                            NewArgs.end(), "", GEPI);
1965    Val->takeName(GEPI);
1966  }
1967  if (Val->getType() != GEPI->getType())
1968    Val = new BitCastInst(Val, GEPI->getType(), Val->getName(), GEPI);
1969  GEPI->replaceAllUsesWith(Val);
1970  DeadInsts.push_back(GEPI);
1971}
1972
1973/// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
1974/// Rewrite it to copy or set the elements of the scalarized memory.
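/// For example (illustrative), an 8-byte memcpy into an alloca split into two
/// i32 elements becomes a load/store pair per element (or an element-sized
/// memcpy/memset when an element is not a single-value type).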
1975void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
1976                                        AllocaInst *AI,
1977                                        SmallVector<AllocaInst*, 32> &NewElts) {
1978  // If this is a memcpy/memmove, construct the other pointer as the
1979  // appropriate type.  The "Other" pointer is the pointer that goes to memory
1980  // that doesn't have anything to do with the alloca that we are promoting. For
1981  // memset, this Value* stays null.
1982  Value *OtherPtr = 0;
1983  unsigned MemAlignment = MI->getAlignment();
1984  if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy
1985    if (Inst == MTI->getRawDest())
1986      OtherPtr = MTI->getRawSource();
1987    else {
1988      assert(Inst == MTI->getRawSource());
1989      OtherPtr = MTI->getRawDest();
1990    }
1991  }
1992
1993  // If there is an other pointer, we want to convert it to the same pointer
1994  // type as AI has, so we can GEP through it safely.
1995  if (OtherPtr) {
1996    unsigned AddrSpace =
1997      cast<PointerType>(OtherPtr->getType())->getAddressSpace();
1998
1999    // Remove bitcasts and all-zero GEPs from OtherPtr.  This is an
2000    // optimization, but it's also required to detect the corner case where
2001    // both pointer operands are referencing the same memory, and where
    // OtherPtr may be a bitcast or GEP that is currently being rewritten.  (This
2003    // function is only called for mem intrinsics that access the whole
2004    // aggregate, so non-zero GEPs are not an issue here.)
2005    OtherPtr = OtherPtr->stripPointerCasts();
2006
2007    // Copying the alloca to itself is a no-op: just delete it.
2008    if (OtherPtr == AI || OtherPtr == NewElts[0]) {
2009      // This code will run twice for a no-op memcpy -- once for each operand.
2010      // Put only one reference to MI on the DeadInsts list.
2011      for (SmallVector<Value*, 32>::const_iterator I = DeadInsts.begin(),
2012             E = DeadInsts.end(); I != E; ++I)
2013        if (*I == MI) return;
2014      DeadInsts.push_back(MI);
2015      return;
2016    }
2017
2018    // If the pointer is not the right type, insert a bitcast to the right
2019    // type.
2020    const Type *NewTy =
2021      PointerType::get(AI->getType()->getElementType(), AddrSpace);
2022
2023    if (OtherPtr->getType() != NewTy)
2024      OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI);
2025  }
2026
2027  // Process each element of the aggregate.
2028  bool SROADest = MI->getRawDest() == Inst;
2029
2030  Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext()));
2031
2032  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
2033    // If this is a memcpy/memmove, emit a GEP of the other element address.
2034    Value *OtherElt = 0;
2035    unsigned OtherEltAlign = MemAlignment;
2036
2037    if (OtherPtr) {
2038      Value *Idx[2] = { Zero,
2039                      ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) };
2040      OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, Idx + 2,
2041                                              OtherPtr->getName()+"."+Twine(i),
2042                                                   MI);
2043      uint64_t EltOffset;
2044      const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
2045      const Type *OtherTy = OtherPtrTy->getElementType();
2046      if (const StructType *ST = dyn_cast<StructType>(OtherTy)) {
2047        EltOffset = TD->getStructLayout(ST)->getElementOffset(i);
2048      } else {
2049        const Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
2050        EltOffset = TD->getTypeAllocSize(EltTy)*i;
2051      }
2052
2053      // The alignment of the other pointer is the guaranteed alignment of the
2054      // element, which is affected by both the known alignment of the whole
2055      // mem intrinsic and the alignment of the element.  If the alignment of
    // the memcpy, for example, is 32 but the element is at a 4-byte offset, then the
2057      // known alignment is just 4 bytes.
2058      OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset);
2059    }
2060
2061    Value *EltPtr = NewElts[i];
2062    const Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
2063
2064    // If we got down to a scalar, insert a load or store as appropriate.
2065    if (EltTy->isSingleValueType()) {
2066      if (isa<MemTransferInst>(MI)) {
2067        if (SROADest) {
2068          // From Other to Alloca.
2069          Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI);
2070          new StoreInst(Elt, EltPtr, MI);
2071        } else {
2072          // From Alloca to Other.
2073          Value *Elt = new LoadInst(EltPtr, "tmp", MI);
2074          new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI);
2075        }
2076        continue;
2077      }
2078      assert(isa<MemSetInst>(MI));
2079
2080      // If the stored element is zero (common case), just store a null
2081      // constant.
2082      Constant *StoreVal;
2083      if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getArgOperand(1))) {
2084        if (CI->isZero()) {
2085          StoreVal = Constant::getNullValue(EltTy);  // 0.0, null, 0, <0,0>
2086        } else {
2087          // If EltTy is a vector type, get the element type.
2088          const Type *ValTy = EltTy->getScalarType();
2089
2090          // Construct an integer with the right value.
2091          unsigned EltSize = TD->getTypeSizeInBits(ValTy);
2092          APInt OneVal(EltSize, CI->getZExtValue());
2093          APInt TotalVal(OneVal);
2094          // Set each byte.
2095          for (unsigned i = 0; 8*i < EltSize; ++i) {
2096            TotalVal = TotalVal.shl(8);
2097            TotalVal |= OneVal;
2098          }
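          // For example (illustrative), a memset byte of 0xAB splatted over a
          // 32-bit element produces the constant 0xABABABAB.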
2099
2100          // Convert the integer value to the appropriate type.
2101          StoreVal = ConstantInt::get(CI->getContext(), TotalVal);
2102          if (ValTy->isPointerTy())
2103            StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy);
2104          else if (ValTy->isFloatingPointTy())
2105            StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy);
2106          assert(StoreVal->getType() == ValTy && "Type mismatch!");
2107
2108          // If the requested value was a vector constant, create it.
2109          if (EltTy != ValTy) {
            unsigned NumElts = cast<VectorType>(EltTy)->getNumElements();
2111            SmallVector<Constant*, 16> Elts(NumElts, StoreVal);
2112            StoreVal = ConstantVector::get(Elts);
2113          }
2114        }
2115        new StoreInst(StoreVal, EltPtr, MI);
2116        continue;
2117      }
2118      // Otherwise, if we're storing a byte variable, use a memset call for
2119      // this element.
2120    }
2121
2122    unsigned EltSize = TD->getTypeAllocSize(EltTy);
2123
2124    IRBuilder<> Builder(MI);
2125
2126    // Finally, insert the meminst for this element.
2127    if (isa<MemSetInst>(MI)) {
2128      Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize,
2129                           MI->isVolatile());
2130    } else {
2131      assert(isa<MemTransferInst>(MI));
2132      Value *Dst = SROADest ? EltPtr : OtherElt;  // Dest ptr
2133      Value *Src = SROADest ? OtherElt : EltPtr;  // Src ptr
2134
2135      if (isa<MemCpyInst>(MI))
2136        Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile());
2137      else
2138        Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile());
2139    }
2140  }
2141  DeadInsts.push_back(MI);
2142}
2143
2144/// RewriteStoreUserOfWholeAlloca - We found a store of an integer that
2145/// overwrites the entire allocation.  Extract out the pieces of the stored
2146/// integer and store them individually.
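/// For example (illustrative, little-endian), storing an i64 over an alloca
/// of { i32, i32 } becomes a trunc-to-i32 store of the low half and an
/// lshr-by-32 plus trunc store of the high half.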
2147void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
2148                                         SmallVector<AllocaInst*, 32> &NewElts){
2149  // Extract each element out of the integer according to its structure offset
2150  // and store the element value to the individual alloca.
2151  Value *SrcVal = SI->getOperand(0);
2152  const Type *AllocaEltTy = AI->getAllocatedType();
2153  uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
2154
2155  IRBuilder<> Builder(SI);
2156
2157  // Handle tail padding by extending the operand
2158  if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
2159    SrcVal = Builder.CreateZExt(SrcVal,
2160                            IntegerType::get(SI->getContext(), AllocaSizeBits));
2161
2162  DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI
2163               << '\n');
2164
2165  // There are two forms here: AI could be an array or struct.  Both cases
2166  // have different ways to compute the element offset.
2167  if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
2168    const StructLayout *Layout = TD->getStructLayout(EltSTy);
2169
2170    for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
2171      // Get the number of bits to shift SrcVal to get the value.
2172      const Type *FieldTy = EltSTy->getElementType(i);
2173      uint64_t Shift = Layout->getElementOffsetInBits(i);
2174
2175      if (TD->isBigEndian())
2176        Shift = AllocaSizeBits-Shift-TD->getTypeAllocSizeInBits(FieldTy);
2177
2178      Value *EltVal = SrcVal;
2179      if (Shift) {
2180        Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
2181        EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
2182      }
2183
2184      // Truncate down to an integer of the right size.
2185      uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
2186
2187      // Ignore zero sized fields like {}, they obviously contain no data.
2188      if (FieldSizeBits == 0) continue;
2189
2190      if (FieldSizeBits != AllocaSizeBits)
2191        EltVal = Builder.CreateTrunc(EltVal,
2192                             IntegerType::get(SI->getContext(), FieldSizeBits));
2193      Value *DestField = NewElts[i];
2194      if (EltVal->getType() == FieldTy) {
2195        // Storing to an integer field of this size, just do it.
2196      } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) {
2197        // Bitcast to the right element type (for fp/vector values).
2198        EltVal = Builder.CreateBitCast(EltVal, FieldTy);
2199      } else {
2200        // Otherwise, bitcast the dest pointer (for aggregates).
2201        DestField = Builder.CreateBitCast(DestField,
2202                                     PointerType::getUnqual(EltVal->getType()));
2203      }
2204      new StoreInst(EltVal, DestField, SI);
2205    }
2206
2207  } else {
2208    const ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
2209    const Type *ArrayEltTy = ATy->getElementType();
2210    uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
2211    uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy);
2212
2213    uint64_t Shift;
2214
2215    if (TD->isBigEndian())
2216      Shift = AllocaSizeBits-ElementOffset;
2217    else
2218      Shift = 0;
2219
2220    for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
2221      // Ignore zero sized fields like {}, they obviously contain no data.
2222      if (ElementSizeBits == 0) continue;
2223
2224      Value *EltVal = SrcVal;
2225      if (Shift) {
2226        Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
2227        EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
2228      }
2229
2230      // Truncate down to an integer of the right size.
2231      if (ElementSizeBits != AllocaSizeBits)
2232        EltVal = Builder.CreateTrunc(EltVal,
2233                                     IntegerType::get(SI->getContext(),
2234                                                      ElementSizeBits));
2235      Value *DestField = NewElts[i];
2236      if (EltVal->getType() == ArrayEltTy) {
2237        // Storing to an integer field of this size, just do it.
2238      } else if (ArrayEltTy->isFloatingPointTy() ||
2239                 ArrayEltTy->isVectorTy()) {
2240        // Bitcast to the right element type (for fp/vector values).
2241        EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy);
2242      } else {
2243        // Otherwise, bitcast the dest pointer (for aggregates).
2244        DestField = Builder.CreateBitCast(DestField,
2245                                     PointerType::getUnqual(EltVal->getType()));
2246      }
2247      new StoreInst(EltVal, DestField, SI);
2248
2249      if (TD->isBigEndian())
2250        Shift -= ElementOffset;
2251      else
2252        Shift += ElementOffset;
2253    }
2254  }
2255
2256  DeadInsts.push_back(SI);
2257}
2258
2259/// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to
2260/// an integer.  Load the individual pieces to form the aggregate value.
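/// For example (illustrative, little-endian), loading an i64 from an alloca
/// of { i32, i32 } becomes two i32 element loads that are zext'd, shifted,
/// and or'd together to reassemble the i64.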
2261void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
2262                                        SmallVector<AllocaInst*, 32> &NewElts) {
2263  // Extract each element out of the NewElts according to its structure offset
2264  // and form the result value.
2265  const Type *AllocaEltTy = AI->getAllocatedType();
2266  uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
2267
2268  DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI
2269               << '\n');
2270
2271  // There are two forms here: AI could be an array or struct.  Both cases
2272  // have different ways to compute the element offset.
2273  const StructLayout *Layout = 0;
2274  uint64_t ArrayEltBitOffset = 0;
2275  if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
2276    Layout = TD->getStructLayout(EltSTy);
2277  } else {
2278    const Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
2279    ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
2280  }
2281
2282  Value *ResultVal =
2283    Constant::getNullValue(IntegerType::get(LI->getContext(), AllocaSizeBits));
2284
2285  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
2286    // Load the value from the alloca.  If the NewElt is an aggregate, cast
2287    // the pointer to an integer of the same size before doing the load.
2288    Value *SrcField = NewElts[i];
2289    const Type *FieldTy =
2290      cast<PointerType>(SrcField->getType())->getElementType();
2291    uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
2292
2293    // Ignore zero sized fields like {}, they obviously contain no data.
2294    if (FieldSizeBits == 0) continue;
2295
2296    const IntegerType *FieldIntTy = IntegerType::get(LI->getContext(),
2297                                                     FieldSizeBits);
2298    if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() &&
2299        !FieldTy->isVectorTy())
2300      SrcField = new BitCastInst(SrcField,
2301                                 PointerType::getUnqual(FieldIntTy),
2302                                 "", LI);
2303    SrcField = new LoadInst(SrcField, "sroa.load.elt", LI);
2304
    // If SrcField is an fp or vector value of the right size but not an
    // integer type, bitcast it to an integer so we can shift it.
2307    if (SrcField->getType() != FieldIntTy)
2308      SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI);
2309
2310    // Zero extend the field to be the same size as the final alloca so that
2311    // we can shift and insert it.
2312    if (SrcField->getType() != ResultVal->getType())
2313      SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI);
2314
2315    // Determine the number of bits to shift SrcField.
2316    uint64_t Shift;
2317    if (Layout) // Struct case.
2318      Shift = Layout->getElementOffsetInBits(i);
2319    else  // Array case.
2320      Shift = i*ArrayEltBitOffset;
2321
2322    if (TD->isBigEndian())
2323      Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth();
2324
2325    if (Shift) {
2326      Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift);
2327      SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI);
2328    }
2329
2330    // Don't create an 'or x, 0' on the first iteration.
2331    if (!isa<Constant>(ResultVal) ||
2332        !cast<Constant>(ResultVal)->isNullValue())
2333      ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI);
2334    else
2335      ResultVal = SrcField;
2336  }
2337
2338  // Handle tail padding by truncating the result
2339  if (TD->getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
2340    ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI);
2341
2342  LI->replaceAllUsesWith(ResultVal);
2343  DeadInsts.push_back(LI);
2344}
2345
2346/// HasPadding - Return true if the specified type has any structure or
2347/// alignment padding in between the elements that would be split apart
2348/// by SROA; return false otherwise.
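/// For example (illustrative, typical targets), { i8, i32 } has three bytes
/// of padding after the i8 field, so a memcpy of the whole struct may carry
/// bytes that per-element replacement would not preserve.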
2349static bool HasPadding(const Type *Ty, const TargetData &TD) {
2350  if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2351    Ty = ATy->getElementType();
2352    return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty);
2353  }
2354
2355  // SROA currently handles only Arrays and Structs.
2356  const StructType *STy = cast<StructType>(Ty);
2357  const StructLayout *SL = TD.getStructLayout(STy);
2358  unsigned PrevFieldBitOffset = 0;
2359  for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
2360    unsigned FieldBitOffset = SL->getElementOffsetInBits(i);
2361
2362    // Check to see if there is any padding between this element and the
2363    // previous one.
2364    if (i) {
2365      unsigned PrevFieldEnd =
2366        PrevFieldBitOffset+TD.getTypeSizeInBits(STy->getElementType(i-1));
2367      if (PrevFieldEnd < FieldBitOffset)
2368        return true;
2369    }
2370    PrevFieldBitOffset = FieldBitOffset;
2371  }
2372  // Check for tail padding.
2373  if (unsigned EltCount = STy->getNumElements()) {
2374    unsigned PrevFieldEnd = PrevFieldBitOffset +
2375      TD.getTypeSizeInBits(STy->getElementType(EltCount-1));
2376    if (PrevFieldEnd < SL->getSizeInBits())
2377      return true;
2378  }
2379  return false;
2380}
2381
/// isSafeAllocaToScalarRepl - Check to see if the specified allocation of an
/// aggregate can be broken down into elements.  Return true if it is safe to
/// do so, false otherwise.
2385bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
2386  // Loop over the use list of the alloca.  We can only transform it if all of
2387  // the users are safe to transform.
2388  AllocaInfo Info(AI);
2389
2390  isSafeForScalarRepl(AI, 0, Info);
2391  if (Info.isUnsafe) {
2392    DEBUG(dbgs() << "Cannot transform: " << *AI << '\n');
2393    return false;
2394  }
2395
2396  // Okay, we know all the users are promotable.  If the aggregate is a memcpy
2397  // source and destination, we have to be careful.  In particular, the memcpy
2398  // could be moving around elements that live in structure padding of the LLVM
2399  // types, but may actually be used.  In these cases, we refuse to promote the
2400  // struct.
2401  if (Info.isMemCpySrc && Info.isMemCpyDst &&
2402      HasPadding(AI->getAllocatedType(), *TD))
2403    return false;
2404
2405  // If the alloca never has an access to just *part* of it, but is accessed
2406  // via loads and stores, then we should use ConvertToScalarInfo to promote
2407  // the alloca instead of promoting each piece at a time and inserting fission
2408  // and fusion code.
2409  if (!Info.hasSubelementAccess && Info.hasALoadOrStore) {
2410    // If the struct/array just has one element, use basic SRoA.
2411    if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
2412      if (ST->getNumElements() > 1) return false;
2413    } else {
2414      if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1)
2415        return false;
2416    }
2417  }
2418
2419  return true;
2420}
2421
2422
2423
2424/// PointsToConstantGlobal - Return true if V (possibly indirectly) points to
2425/// some part of a constant global variable.  This intentionally only accepts
/// constant expressions because we can't rewrite arbitrary instructions.
2427static bool PointsToConstantGlobal(Value *V) {
2428  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
2429    return GV->isConstant();
2430  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
2431    if (CE->getOpcode() == Instruction::BitCast ||
2432        CE->getOpcode() == Instruction::GetElementPtr)
2433      return PointsToConstantGlobal(CE->getOperand(0));
2434  return false;
2435}
2436
2437/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
2438/// pointer to an alloca.  Ignore any reads of the pointer, return false if we
2439/// see any stores or other unknown uses.  If we see pointer arithmetic, keep
2440/// track of whether it moves the pointer (with isOffset) but otherwise traverse
2441/// the uses.  If we see a memcpy/memmove that targets an unoffseted pointer to
2442/// the alloca, and if the source pointer is a pointer to a constant global, we
2443/// can optimize this.
2444static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
2445                                           bool isOffset) {
2446  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
2447    User *U = cast<Instruction>(*UI);
2448
2449    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
2450      // Ignore non-volatile loads, they are always ok.
2451      if (LI->isVolatile()) return false;
2452      continue;
2453    }
2454
2455    if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
2456      // If uses of the bitcast are ok, we are ok.
2457      if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset))
2458        return false;
2459      continue;
2460    }
2461    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
2462      // If the GEP has all zero indices, it doesn't offset the pointer.  If it
2463      // doesn't, it does.
2464      if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy,
2465                                         isOffset || !GEP->hasAllZeroIndices()))
2466        return false;
2467      continue;
2468    }
2469
2470    if (CallSite CS = U) {
2471      // If this is a readonly/readnone call site, then we know it is just a
2472      // load and we can ignore it.
2473      if (CS.onlyReadsMemory())
2474        continue;
2475
2476      // If this is the function being called then we treat it like a load and
2477      // ignore it.
2478      if (CS.isCallee(UI))
2479        continue;
2480
2481      // If this is being passed as a byval argument, the caller is making a
2482      // copy, so it is only a read of the alloca.
2483      unsigned ArgNo = CS.getArgumentNo(UI);
2484      if (CS.paramHasAttr(ArgNo+1, Attribute::ByVal))
2485        continue;
2486    }
2487
    // If this isn't our memcpy/memmove, reject it as something we can't
2489    // handle.
2490    MemTransferInst *MI = dyn_cast<MemTransferInst>(U);
2491    if (MI == 0)
2492      return false;
2493
2494    // If the transfer is using the alloca as a source of the transfer, then
2495    // ignore it since it is a load (unless the transfer is volatile).
2496    if (UI.getOperandNo() == 1) {
2497      if (MI->isVolatile()) return false;
2498      continue;
2499    }
2500
2501    // If we already have seen a copy, reject the second one.
2502    if (TheCopy) return false;
2503
2504    // If the pointer has been offset from the start of the alloca, we can't
2505    // safely handle this.
2506    if (isOffset) return false;
2507
2508    // If the memintrinsic isn't using the alloca as the dest, reject it.
2509    if (UI.getOperandNo() != 0) return false;
2510
2511    // If the source of the memcpy/move is not a constant global, reject it.
2512    if (!PointsToConstantGlobal(MI->getSource()))
2513      return false;
2514
2515    // Otherwise, the transform is safe.  Remember the copy instruction.
2516    TheCopy = MI;
2517  }
2518  return true;
2519}
2520
/// isOnlyCopiedFromConstantGlobal - If the specified alloca is only modified
/// by a copy from a constant global, return the memcpy/memmove performing the
/// copy; otherwise return null.  If we can prove this, we can
2523/// replace any uses of the alloca with uses of the global directly.
2524MemTransferInst *SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI) {
2525  MemTransferInst *TheCopy = 0;
2526  if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false))
2527    return TheCopy;
2528  return 0;
2529}
2530