LoopVectorize.cpp revision b1bf1eeede72b8c93505dd80fdf21aed0e205c7d
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11// and generates target-independent LLVM-IR. Legalization of the IR is done
// in the codegen. However, the vectorizer uses (will use) the codegen
13// interfaces to generate IR that is likely to result in an optimal binary.
14//
15// The loop vectorizer combines consecutive loop iteration into a single
16// 'wide' iteration. After this transformation the index is incremented
17// by the SIMD vector width, and not by one.
18//
// This pass has four parts:
20// 1. The main loop pass that drives the different parts.
21// 2. LoopVectorizationLegality - A unit that checks for the legality
22//    of the vectorization.
23// 3. SingleBlockLoopVectorizer - A unit that performs the actual
24//    widening of instructions.
25// 4. LoopVectorizationCostModel - A unit that checks for the profitability
26//    of vectorization. It decides on the optimal vector width, which
27//    can be one, if vectorization is not profitable.
28//===----------------------------------------------------------------------===//
29//
30// The reduction-variable vectorization is based on the paper:
31//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
32//
33// Variable uniformity checks are inspired by:
34// Karrenberg, R. and Hack, S. Whole Function Vectorization.
35//
36// Other ideas/concepts are from:
37//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
38//
39//===----------------------------------------------------------------------===//
40#define LV_NAME "loop-vectorize"
41#define DEBUG_TYPE LV_NAME
42#include "llvm/Constants.h"
43#include "llvm/DerivedTypes.h"
44#include "llvm/Instructions.h"
45#include "llvm/LLVMContext.h"
46#include "llvm/Pass.h"
47#include "llvm/Analysis/LoopPass.h"
48#include "llvm/Value.h"
49#include "llvm/Function.h"
50#include "llvm/Analysis/Verifier.h"
51#include "llvm/Module.h"
52#include "llvm/Type.h"
53#include "llvm/ADT/SmallVector.h"
54#include "llvm/ADT/StringExtras.h"
55#include "llvm/Analysis/AliasAnalysis.h"
56#include "llvm/Analysis/AliasSetTracker.h"
57#include "llvm/Analysis/ScalarEvolution.h"
58#include "llvm/Analysis/Dominators.h"
59#include "llvm/Analysis/ScalarEvolutionExpressions.h"
60#include "llvm/Analysis/ScalarEvolutionExpander.h"
61#include "llvm/Analysis/LoopInfo.h"
62#include "llvm/Analysis/ValueTracking.h"
63#include "llvm/Transforms/Scalar.h"
64#include "llvm/Transforms/Utils/BasicBlockUtils.h"
65#include "llvm/TargetTransformInfo.h"
66#include "llvm/Support/CommandLine.h"
67#include "llvm/Support/Debug.h"
68#include "llvm/Support/raw_ostream.h"
69#include "llvm/DataLayout.h"
70#include "llvm/Transforms/Utils/Local.h"
71#include <algorithm>
72using namespace llvm;
73
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
          cl::desc("Set the default vectorization width. Zero is autoselect."));

/// We don't vectorize loops with a known constant trip count below this number.
const unsigned TinyTripCountThreshold = 16;

/// When performing a runtime memory check, do not check more than this
/// number of pointers. Notice that the check is quadratic!
const unsigned RuntimeMemoryCheckThreshold = 2;
84
85namespace {
86
87// Forward declarations.
88class LoopVectorizationLegality;
89class LoopVectorizationCostModel;
90
/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// SingleBlockLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The SingleBlockLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class SingleBlockLoopVectorizer {
public:
  /// Ctor. Takes the loop to transform, the analyses it needs, and the
  /// vectorization factor (number of lanes) to widen to.
  SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
                            DominatorTree *dt, LPPassManager *Lpm,
                            unsigned VecWidth):
  OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
  Builder(Se->getContext()), Induction(0), OldInduction(0) { }

  // Perform the actual loop widening (vectorization). This is the only
  // public entry point; the three steps below must run in this order.
  void vectorize(LoopVectorizationLegality *Legal) {
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop(Legal);
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop(Legal);
    // Register the new loop with the analyses (LoopInfo etc.).
    updateAnalysis();
 }

private:
  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop(LoopVectorizationLegality *Legal);
  /// Copy and widen the instructions from the old loop.
  void vectorizeLoop(LoopVectorizationLegality *Legal);
  /// Insert the new loop into the loop hierarchy and pass manager.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars, one clone per SIMD lane.
  void scalarizeInstruction(Instruction *Instr);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  Value *getBroadcastInstrs(Value *V);

  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
  /// for each element in the vector. Starting from zero.
  Value *getConsecutiveVector(Value* Val);

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  Value *getVectorValue(Value *V);

  /// Get a uniform vector of constant integers. We use this to get
  /// vectors of ones and zeros for the reduction code.
  Constant* getUniformVector(unsigned Val, Type* ScalarTy);

  /// Maps an original scalar value to its widened (vector) counterpart.
  typedef DenseMap<Value*, Value*> ValueMap;

  /// The original loop.
  Loop *OrigLoop;
  // Scev analysis to use.
  ScalarEvolution *SE;
  // Loop Info.
  LoopInfo *LI;
  // Dominator Tree.
  DominatorTree *DT;
  // Loop Pass Manager;
  LPPassManager *LPM;
  // The vectorization factor to use.
  unsigned VF;

  // The builder that we use to emit the widened instructions.
  IRBuilder<> Builder;

  // --- Vectorization state ---
  // NOTE(review): the BasicBlock pointers below are set by createEmptyLoop
  // and are only valid after vectorize() has started.

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  BasicBlock *LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// The first bypass block.
  BasicBlock *LoopBypassBlock;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  // Maps scalars to widened vectors.
  ValueMap WidenMap;
};
201
202/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
203/// to what vectorization factor.
204/// This class does not look at the profitability of vectorization, only the
205/// legality. This class has two main kinds of checks:
206/// * Memory checks - The code in canVectorizeMemory checks if vectorization
207///   will change the order of memory accesses in a way that will change the
208///   correctness of the program.
209/// * Scalars checks - The code in canVectorizeBlock checks for a number
210///   of different conditions, such as the availability of a single induction
211///   variable, that all types are supported and vectorize-able, etc.
212/// This code reflects the capabilities of SingleBlockLoopVectorizer.
213/// This class is also used by SingleBlockLoopVectorizer for identifying
214/// induction variable and the different reduction variables.
215class LoopVectorizationLegality {
216public:
217  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
218  TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
219
220  /// This represents the kinds of reductions that we support.
221  enum ReductionKind {
222    NoReduction, /// Not a reduction.
223    IntegerAdd,  /// Sum of numbers.
224    IntegerMult, /// Product of numbers.
225    IntegerOr,   /// Bitwise or logical OR of numbers.
226    IntegerAnd,  /// Bitwise or logical AND of numbers.
227    IntegerXor   /// Bitwise or logical XOR of numbers.
228  };
229
230  /// This POD struct holds information about reduction variables.
231  struct ReductionDescriptor {
232    // Default C'tor
233    ReductionDescriptor():
234    StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
235
236    // C'tor.
237    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
238    StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
239
240    // The starting value of the reduction.
241    // It does not have to be zero!
242    Value *StartValue;
243    // The instruction who's value is used outside the loop.
244    Instruction *LoopExitInstr;
245    // The kind of the reduction.
246    ReductionKind Kind;
247  };
248
249  // This POD struct holds information about the memory runtime legality
250  // check that a group of pointers do not overlap.
251  struct RuntimePointerCheck {
252    /// This flag indicates if we need to add the runtime check.
253    bool Need;
254    /// Holds the pointers that we need to check.
255    SmallVector<Value*, 2> Pointers;
256  };
257
258  /// ReductionList contains the reduction descriptors for all
259  /// of the reductions that were found in the loop.
260  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
261
262  /// Returns true if it is legal to vectorize this loop.
263  /// This does not mean that it is profitable to vectorize this
264  /// loop, only that it is legal to do so.
265  bool canVectorize();
266
267  /// Returns the Induction variable.
268  PHINode *getInduction() {return Induction;}
269
270  /// Returns the reduction variables found in the loop.
271  ReductionList *getReductionVars() { return &Reductions; }
272
273  /// Check if the pointer returned by this GEP is consecutive
274  /// when the index is vectorized. This happens when the last
275  /// index of the GEP is consecutive, like the induction variable.
276  /// This check allows us to vectorize A[idx] into a wide load/store.
277  bool isConsecutiveGep(Value *Ptr);
278
279  /// Returns true if the value V is uniform within the loop.
280  bool isUniform(Value *V);
281
282  /// Returns true if this instruction will remain scalar after vectorization.
283  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
284
285  /// Returns the information that we collected about runtime memory check.
286  RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
287private:
288  /// Check if a single basic block loop is vectorizable.
289  /// At this point we know that this is a loop with a constant trip count
290  /// and we only need to check individual instructions.
291  bool canVectorizeBlock(BasicBlock &BB);
292
293  /// When we vectorize loops we may change the order in which
294  /// we read and write from memory. This method checks if it is
295  /// legal to vectorize the code, considering only memory constrains.
296  /// Returns true if BB is vectorizable
297  bool canVectorizeMemory(BasicBlock &BB);
298
299  /// Returns True, if 'Phi' is the kind of reduction variable for type
300  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
301  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
302  /// Returns true if the instruction I can be a reduction variable of type
303  /// 'Kind'.
304  bool isReductionInstr(Instruction *I, ReductionKind Kind);
305  /// Returns True, if 'Phi' is an induction variable.
306  bool isInductionVariable(PHINode *Phi);
307  /// Return true if we
308  bool hasComputableBounds(Value *Ptr);
309
310  /// The loop that we evaluate.
311  Loop *TheLoop;
312  /// Scev analysis.
313  ScalarEvolution *SE;
314  /// DataLayout analysis.
315  DataLayout *DL;
316
317  //  ---  vectorization state --- //
318
319  /// Holds the induction variable.
320  PHINode *Induction;
321  /// Holds the reduction variables.
322  ReductionList Reductions;
323  /// Allowed outside users. This holds the reduction
324  /// vars which can be accessed from outside the loop.
325  SmallPtrSet<Value*, 4> AllowedExit;
326  /// This set holds the variables which are known to be uniform after
327  /// vectorization.
328  SmallPtrSet<Instruction*, 4> Uniforms;
329  /// We need to check that all of the pointers in this list are disjoint
330  /// at runtime.
331  RuntimePointerCheck PtrRtCheck;
332};
333
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because
/// of a number of reasons. In this class we mainly attempt to predict
/// the expected speedup/slowdowns due to the supported instruction set.
/// We use the VectorTargetTransformInfo to query the different backends
/// for the cost of different operations.
class LoopVectorizationCostModel {
public:
  /// C'tor. Takes the loop, the analyses, and the (possibly null) target
  /// cost information.
  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
                             LoopVectorizationLegality *Leg,
                             const VectorTargetTransformInfo *Vtti):
  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }

  /// Returns the most profitable vectorization factor for the loop that is
  /// smaller or equal to the VF argument. This method checks every power
  /// of two up to VF. A result of one means that vectorizing is not
  /// expected to be profitable.
  unsigned findBestVectorizationFactor(unsigned VF = 8);

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information. May be null if the target did not
  /// provide one; callers must handle that case.
  const VectorTargetTransformInfo *VTTI;
};
380
381struct LoopVectorize : public LoopPass {
382  static char ID; // Pass identification, replacement for typeid
383
384  LoopVectorize() : LoopPass(ID) {
385    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
386  }
387
388  ScalarEvolution *SE;
389  DataLayout *DL;
390  LoopInfo *LI;
391  TargetTransformInfo *TTI;
392  DominatorTree *DT;
393
394  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
395    // We only vectorize innermost loops.
396    if (!L->empty())
397      return false;
398
399    SE = &getAnalysis<ScalarEvolution>();
400    DL = getAnalysisIfAvailable<DataLayout>();
401    LI = &getAnalysis<LoopInfo>();
402    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
403    DT = &getAnalysis<DominatorTree>();
404
405    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
406          L->getHeader()->getParent()->getName() << "\"\n");
407
408    // Check if it is legal to vectorize the loop.
409    LoopVectorizationLegality LVL(L, SE, DL);
410    if (!LVL.canVectorize()) {
411      DEBUG(dbgs() << "LV: Not vectorizing.\n");
412      return false;
413    }
414
415    // Select the preffered vectorization factor.
416    unsigned VF = 1;
417    if (VectorizationFactor == 0) {
418      const VectorTargetTransformInfo *VTTI = 0;
419      if (TTI)
420        VTTI = TTI->getVectorTargetTransformInfo();
421      // Use the cost model.
422      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
423      VF = CM.findBestVectorizationFactor();
424
425      if (VF == 1) {
426        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
427        return false;
428      }
429
430    } else {
431      // Use the user command flag.
432      VF = VectorizationFactor;
433    }
434
435    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
436          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
437          "\n");
438
439    // If we decided that it is *legal* to vectorizer the loop then do it.
440    SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
441    LB.vectorize(&LVL);
442
443    DEBUG(verifyFunction(*L->getHeader()->getParent()));
444    return true;
445  }
446
447  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
448    LoopPass::getAnalysisUsage(AU);
449    AU.addRequiredID(LoopSimplifyID);
450    AU.addRequiredID(LCSSAID);
451    AU.addRequired<LoopInfo>();
452    AU.addRequired<ScalarEvolution>();
453    AU.addRequired<DominatorTree>();
454    AU.addPreserved<LoopInfo>();
455    AU.addPreserved<DominatorTree>();
456  }
457
458};
459
460Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
461  // Instructions that access the old induction variable
462  // actually want to get the new one.
463  if (V == OldInduction)
464    V = Induction;
465  // Create the types.
466  LLVMContext &C = V->getContext();
467  Type *VTy = VectorType::get(V->getType(), VF);
468  Type *I32 = IntegerType::getInt32Ty(C);
469  Constant *Zero = ConstantInt::get(I32, 0);
470  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
471  Value *UndefVal = UndefValue::get(VTy);
472  // Insert the value into a new vector.
473  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
474  // Broadcast the scalar into all locations in the vector.
475  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
476                                             "broadcast");
477  // We are accessing the induction variable. Make sure to promote the
478  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
479  if (V == Induction)
480    return getConsecutiveVector(Shuf);
481  return Shuf;
482}
483
484Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
485  assert(Val->getType()->isVectorTy() && "Must be a vector");
486  assert(Val->getType()->getScalarType()->isIntegerTy() &&
487         "Elem must be an integer");
488  // Create the types.
489  Type *ITy = Val->getType()->getScalarType();
490  VectorType *Ty = cast<VectorType>(Val->getType());
491  unsigned VLen = Ty->getNumElements();
492  SmallVector<Constant*, 8> Indices;
493
494  // Create a vector of consecutive numbers from zero to VF.
495  for (unsigned i = 0; i < VLen; ++i)
496    Indices.push_back(ConstantInt::get(ITy, i));
497
498  // Add the consecutive indices to the vector value.
499  Constant *Cv = ConstantVector::get(Indices);
500  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
501  return Builder.CreateAdd(Val, Cv, "induction");
502}
503
504bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
505  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
506  if (!Gep)
507    return false;
508
509  unsigned NumOperands = Gep->getNumOperands();
510  Value *LastIndex = Gep->getOperand(NumOperands - 1);
511
512  // Check that all of the gep indices are uniform except for the last.
513  for (unsigned i = 0; i < NumOperands - 1; ++i)
514    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
515      return false;
516
517  // We can emit wide load/stores only of the last index is the induction
518  // variable.
519  const SCEV *Last = SE->getSCEV(LastIndex);
520  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
521    const SCEV *Step = AR->getStepRecurrence(*SE);
522
523    // The memory is consecutive because the last index is consecutive
524    // and all other indices are loop invariant.
525    if (Step->isOne())
526      return true;
527  }
528
529  return false;
530}
531
532bool LoopVectorizationLegality::isUniform(Value *V) {
533  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
534}
535
536Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
537  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
538  // If we saved a vectorized copy of V, use it.
539  Value *&MapEntry = WidenMap[V];
540  if (MapEntry)
541    return MapEntry;
542
543  // Broadcast V and save the value for future uses.
544  Value *B = getBroadcastInstrs(V);
545  MapEntry = B;
546  return B;
547}
548
549Constant*
550SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
551  SmallVector<Constant*, 8> Indices;
552  // Create a vector of consecutive numbers from zero to VF.
553  for (unsigned i = 0; i < VF; ++i)
554    Indices.push_back(ConstantInt::get(ScalarTy, Val, true));
555
556  // Add the consecutive indices to the vector value.
557  return ConstantVector::get(Indices);
558}
559
560void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
561  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
562  // Holds vector parameters or scalars, in case of uniform vals.
563  SmallVector<Value*, 8> Params;
564
565  // Find all of the vectorized parameters.
566  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
567    Value *SrcOp = Instr->getOperand(op);
568
569    // If we are accessing the old induction variable, use the new one.
570    if (SrcOp == OldInduction) {
571      Params.push_back(getBroadcastInstrs(Induction));
572      continue;
573    }
574
575    // Try using previously calculated values.
576    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
577
578    // If the src is an instruction that appeared earlier in the basic block
579    // then it should already be vectorized.
580    if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
581      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
582      // The parameter is a vector value from earlier.
583      Params.push_back(WidenMap[SrcInst]);
584    } else {
585      // The parameter is a scalar from outside the loop. Maybe even a constant.
586      Params.push_back(SrcOp);
587    }
588  }
589
590  assert(Params.size() == Instr->getNumOperands() &&
591         "Invalid number of operands");
592
593  // Does this instruction return a value ?
594  bool IsVoidRetTy = Instr->getType()->isVoidTy();
595  Value *VecResults = 0;
596
597  // If we have a return value, create an empty vector. We place the scalarized
598  // instructions in this vector.
599  if (!IsVoidRetTy)
600    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
601
602  // For each scalar that we create:
603  for (unsigned i = 0; i < VF; ++i) {
604    Instruction *Cloned = Instr->clone();
605    if (!IsVoidRetTy)
606      Cloned->setName(Instr->getName() + ".cloned");
607    // Replace the operands of the cloned instrucions with extracted scalars.
608    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
609      Value *Op = Params[op];
610      // Param is a vector. Need to extract the right lane.
611      if (Op->getType()->isVectorTy())
612        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
613      Cloned->setOperand(op, Op);
614    }
615
616    // Place the cloned scalar in the new loop.
617    Builder.Insert(Cloned);
618
619    // If the original scalar returns a value we need to place it in a vector
620    // so that future users will be able to use it.
621    if (!IsVoidRetTy)
622      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
623                                               Builder.getInt32(i));
624  }
625
626  if (!IsVoidRetTy)
627    WidenMap[Instr] = VecResults;
628}
629
630void
631SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
632  /*
633   In this function we generate a new loop. The new loop will contain
634   the vectorized instructions while the old loop will continue to run the
635   scalar remainder.
636
637    [ ] <-- vector loop bypass.
638  /  |
639 /   v
640|   [ ]     <-- vector pre header.
641|    |
642|    v
643|   [  ] \
644|   [  ]_|   <-- vector loop.
645|    |
646 \   v
647   >[ ]   <--- middle-block.
648  /  |
649 /   v
650|   [ ]     <--- new preheader.
651|    |
652|    v
653|   [ ] \
654|   [ ]_|   <-- old scalar loop to handle remainder.
655 \   |
656  \  v
657   >[ ]     <-- exit block.
658   ...
659   */
660
661  OldInduction = Legal->getInduction();
662  assert(OldInduction && "We must have a single phi node.");
663  Type *IdxTy = OldInduction->getType();
664
665  // Find the loop boundaries.
666  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
667  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
668
669  // Get the total trip count from the count by adding 1.
670  ExitCount = SE->getAddExpr(ExitCount,
671                             SE->getConstant(ExitCount->getType(), 1));
672  // We may need to extend the index in case there is a type mismatch.
673  // We know that the count starts at zero and does not overflow.
674  // We are using Zext because it should be less expensive.
675  if (ExitCount->getType() != IdxTy)
676    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
677
678  // This is the original scalar-loop preheader.
679  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
680  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
681  assert(ExitBlock && "Must have an exit block");
682
683  // The loop index does not have to start at Zero. It starts with this value.
684  Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
685
686  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
687  assert(BypassBlock && "Invalid loop structure");
688
689  BasicBlock *VectorPH =
690      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
691  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
692                                                 "vector.body");
693
694  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
695                                                  "middle.block");
696  BasicBlock *ScalarPH =
697    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
698                                 "scalar.preheader");
699  // Find the induction variable.
700  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
701
702  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
703  // inside the loop.
704  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
705
706  // Generate the induction variable.
707  Induction = Builder.CreatePHI(IdxTy, 2, "index");
708  Constant *Step = ConstantInt::get(IdxTy, VF);
709
710  // Expand the trip count and place the new instructions in the preheader.
711  // Notice that the pre-header does not change, only the loop body.
712  SCEVExpander Exp(*SE, "induction");
713  Instruction *Loc = BypassBlock->getTerminator();
714
715  // Count holds the overall loop count (N).
716  Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
717
718  // Add the start index to the loop count to get the new end index.
719  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
720
721  // Now we need to generate the expression for N - (N % VF), which is
722  // the part that the vectorized body will execute.
723  Constant *CIVF = ConstantInt::get(IdxTy, VF);
724  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
725  Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
726  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
727                                                     "end.idx.rnd.down", Loc);
728
729  // Now, compare the new count to zero. If it is zero, jump to the scalar part.
730  Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
731                               IdxEndRoundDown,
732                               StartIdx,
733                               "cmp.zero", Loc);
734
735  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
736    Legal->getRuntimePointerCheck();
737  Value *MemoryRuntimeCheck = 0;
738  if (PtrRtCheck->Need) {
739    unsigned NumPointers = PtrRtCheck->Pointers.size();
740    SmallVector<Value* , 2> Starts;
741    SmallVector<Value* , 2> Ends;
742
743    // Use this type for pointer arithmetic.
744    Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
745
746    for (unsigned i=0; i < NumPointers; ++i) {
747      Value *Ptr = PtrRtCheck->Pointers[i];
748      const SCEV *Sc = SE->getSCEV(Ptr);
749
750      if (SE->isLoopInvariant(Sc, OrigLoop)) {
751        DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
752              *Ptr <<"\n");
753        Starts.push_back(Ptr);
754        Ends.push_back(Ptr);
755      } else {
756        DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
757        const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
758        Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc);
759        const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
760        const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
761        assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range.");
762        Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc);
763        Starts.push_back(Start);
764        Ends.push_back(End);
765      }
766    }
767
768    for (unsigned i=0; i < NumPointers; ++i) {
769      for (unsigned j=i+1; j < NumPointers; ++j) {
770        Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
771                                      Starts[0], Ends[1], "bound0", Loc);
772        Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
773                                      Starts[1], Ends[0], "bound1", Loc);
774        Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
775                                                    "found.conflict", Loc);
776        if (MemoryRuntimeCheck) {
777          MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
778                                                      MemoryRuntimeCheck,
779                                                      IsConflict,
780                                                      "conflict.rdx", Loc);
781        } else {
782          MemoryRuntimeCheck = IsConflict;
783        }
784      }
785    }
786  }// end of need-runtime-check code.
787
788  // If we are using memory runtime checks, include them in.
789  if (MemoryRuntimeCheck) {
790    Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
791                                 "CntOrMem", Loc);
792  }
793
794  BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
795  // Remove the old terminator.
796  Loc->eraseFromParent();
797
798  // We are going to resume the execution of the scalar loop.
799  // This PHI decides on what number to start. If we come from the
800  // vector loop then we need to start with the end index minus the
801  // index modulo VF. If we come from a bypass edge then we need to start
802  // from the real start.
803  PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx",
804                                         MiddleBlock->getTerminator());
805  ResumeIndex->addIncoming(StartIdx, BypassBlock);
806  ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
807
808  // Add a check in the middle block to see if we have completed
809  // all of the iterations in the first vector loop.
810  // If (N - N%VF) == N, then we *don't* need to run the remainder.
811  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
812                                ResumeIndex, "cmp.n",
813                                MiddleBlock->getTerminator());
814
815  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
816  // Remove the old terminator.
817  MiddleBlock->getTerminator()->eraseFromParent();
818
819  // Create i+1 and fill the PHINode.
820  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
821  Induction->addIncoming(StartIdx, VectorPH);
822  Induction->addIncoming(NextIdx, VecBody);
823  // Create the compare.
824  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
825  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
826
827  // Now we have two terminators. Remove the old one from the block.
828  VecBody->getTerminator()->eraseFromParent();
829
830  // Fix the scalar body iteration count.
831  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
832  OldInduction->setIncomingValue(BlockIdx, ResumeIndex);
833
834  // Get ready to start creating new instructions into the vectorized body.
835  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
836
837  // Register the new loop.
838  Loop* Lp = new Loop();
839  LPM->insertLoop(Lp, OrigLoop->getParentLoop());
840
841  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
842
843  Loop *ParentLoop = OrigLoop->getParentLoop();
844  if (ParentLoop) {
845    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
846    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
847    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
848  }
849
850  // Save the state.
851  LoopVectorPreHeader = VectorPH;
852  LoopScalarPreHeader = ScalarPH;
853  LoopMiddleBlock = MiddleBlock;
854  LoopExitBlock = ExitBlock;
855  LoopVectorBody = VecBody;
856  LoopScalarBody = OldBasicBlock;
857  LoopBypassBlock = BypassBlock;
858}
859
860/// This function returns the identity element (or neutral element) for
861/// the operation K.
862static unsigned
863getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
864  switch (K) {
865  case LoopVectorizationLegality::IntegerXor:
866  case LoopVectorizationLegality::IntegerAdd:
867  case LoopVectorizationLegality::IntegerOr:
868    // Adding, Xoring, Oring zero to a number does not change it.
869    return 0;
870  case LoopVectorizationLegality::IntegerMult:
871    // Multiplying a number by 1 does not change it.
872    return 1;
873  case LoopVectorizationLegality::IntegerAnd:
874    // AND-ing a number with an all-1 value does not change it.
875    return -1;
876  default:
877    llvm_unreachable("Unknown reduction kind");
878  }
879}
880
881void
882SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
883  //===------------------------------------------------===//
884  //
885  // Notice: any optimization or new instruction that go
886  // into the code below should be also be implemented in
887  // the cost-model.
888  //
889  //===------------------------------------------------===//
890  typedef SmallVector<PHINode*, 4> PhiVector;
891  BasicBlock &BB = *OrigLoop->getHeader();
892  Constant *Zero = ConstantInt::get(
893    IntegerType::getInt32Ty(BB.getContext()), 0);
894
895  // In order to support reduction variables we need to be able to vectorize
896  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
897  // steages. First, we create a new vector PHI node with no incoming edges.
898  // We use this value when we vectorize all of the instructions that use the
899  // PHI. Next, after all of the instructions in the block are complete we
900  // add the new incoming edges to the PHI. At this point all of the
901  // instructions in the basic block are vectorized, so we can use them to
902  // construct the PHI.
903  PhiVector PHIsToFix;
904
905  // For each instruction in the old loop.
906  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
907    Instruction *Inst = it;
908
909    switch (Inst->getOpcode()) {
910      case Instruction::Br:
911        // Nothing to do for PHIs and BR, since we already took care of the
912        // loop control flow instructions.
913        continue;
914      case Instruction::PHI:{
915        PHINode* P = cast<PHINode>(Inst);
916        // Special handling for the induction var.
917        if (OldInduction == Inst)
918          continue;
919        // This is phase one of vectorizing PHIs.
920        // This has to be a reduction variable.
921        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
922        Type *VecTy = VectorType::get(Inst->getType(), VF);
923        WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
924        PHIsToFix.push_back(P);
925        continue;
926      }
927      case Instruction::Add:
928      case Instruction::FAdd:
929      case Instruction::Sub:
930      case Instruction::FSub:
931      case Instruction::Mul:
932      case Instruction::FMul:
933      case Instruction::UDiv:
934      case Instruction::SDiv:
935      case Instruction::FDiv:
936      case Instruction::URem:
937      case Instruction::SRem:
938      case Instruction::FRem:
939      case Instruction::Shl:
940      case Instruction::LShr:
941      case Instruction::AShr:
942      case Instruction::And:
943      case Instruction::Or:
944      case Instruction::Xor: {
945        // Just widen binops.
946        BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
947        Value *A = getVectorValue(Inst->getOperand(0));
948        Value *B = getVectorValue(Inst->getOperand(1));
949
950        // Use this vector value for all users of the original instruction.
951        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
952        WidenMap[Inst] = V;
953
954        // Update the NSW, NUW and Exact flags.
955        BinaryOperator *VecOp = cast<BinaryOperator>(V);
956        if (isa<OverflowingBinaryOperator>(BinOp)) {
957          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
958          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
959        }
960        if (isa<PossiblyExactOperator>(VecOp))
961          VecOp->setIsExact(BinOp->isExact());
962        break;
963      }
964      case Instruction::Select: {
965        // Widen selects.
966        // If the selector is loop invariant we can create a select
967        // instruction with a scalar condition. Otherwise, use vector-select.
968        Value *Cond = Inst->getOperand(0);
969        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
970
971        // The condition can be loop invariant  but still defined inside the
972        // loop. This means that we can't just use the original 'cond' value.
973        // We have to take the 'vectorized' value and pick the first lane.
974        // Instcombine will make this a no-op.
975        Cond = getVectorValue(Cond);
976        if (InvariantCond)
977          Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
978
979        Value *Op0 = getVectorValue(Inst->getOperand(1));
980        Value *Op1 = getVectorValue(Inst->getOperand(2));
981        WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
982        break;
983      }
984
985      case Instruction::ICmp:
986      case Instruction::FCmp: {
987        // Widen compares. Generate vector compares.
988        bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
989        CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
990        Value *A = getVectorValue(Inst->getOperand(0));
991        Value *B = getVectorValue(Inst->getOperand(1));
992        if (FCmp)
993          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
994        else
995          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
996        break;
997      }
998
999      case Instruction::Store: {
1000        // Attempt to issue a wide store.
1001        StoreInst *SI = dyn_cast<StoreInst>(Inst);
1002        Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
1003        Value *Ptr = SI->getPointerOperand();
1004        unsigned Alignment = SI->getAlignment();
1005
1006        assert(!Legal->isUniform(Ptr) &&
1007               "We do not allow storing to uniform addresses");
1008
1009        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
1010
1011        // This store does not use GEPs.
1012        if (!Legal->isConsecutiveGep(Gep)) {
1013          scalarizeInstruction(Inst);
1014          break;
1015        }
1016
1017        // The last index does not have to be the induction. It can be
1018        // consecutive and be a function of the index. For example A[I+1];
1019        unsigned NumOperands = Gep->getNumOperands();
1020        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
1021        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
1022
1023        // Create the new GEP with the new induction variable.
1024        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
1025        Gep2->setOperand(NumOperands - 1, LastIndex);
1026        Ptr = Builder.Insert(Gep2);
1027        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
1028        Value *Val = getVectorValue(SI->getValueOperand());
1029        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
1030        break;
1031      }
1032      case Instruction::Load: {
1033        // Attempt to issue a wide load.
1034        LoadInst *LI = dyn_cast<LoadInst>(Inst);
1035        Type *RetTy = VectorType::get(LI->getType(), VF);
1036        Value *Ptr = LI->getPointerOperand();
1037        unsigned Alignment = LI->getAlignment();
1038        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
1039
1040        // If we don't have a gep, or that the pointer is loop invariant,
1041        // scalarize the load.
1042        if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) {
1043          scalarizeInstruction(Inst);
1044          break;
1045        }
1046
1047        // The last index does not have to be the induction. It can be
1048        // consecutive and be a function of the index. For example A[I+1];
1049        unsigned NumOperands = Gep->getNumOperands();
1050        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
1051        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
1052
1053        // Create the new GEP with the new induction variable.
1054        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
1055        Gep2->setOperand(NumOperands - 1, LastIndex);
1056        Ptr = Builder.Insert(Gep2);
1057        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
1058        LI = Builder.CreateLoad(Ptr);
1059        LI->setAlignment(Alignment);
1060        // Use this vector value for all users of the load.
1061        WidenMap[Inst] = LI;
1062        break;
1063      }
1064      case Instruction::ZExt:
1065      case Instruction::SExt:
1066      case Instruction::FPToUI:
1067      case Instruction::FPToSI:
1068      case Instruction::FPExt:
1069      case Instruction::PtrToInt:
1070      case Instruction::IntToPtr:
1071      case Instruction::SIToFP:
1072      case Instruction::UIToFP:
1073      case Instruction::Trunc:
1074      case Instruction::FPTrunc:
1075      case Instruction::BitCast: {
1076        /// Vectorize bitcasts.
1077        CastInst *CI = dyn_cast<CastInst>(Inst);
1078        Value *A = getVectorValue(Inst->getOperand(0));
1079        Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
1080        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
1081        break;
1082      }
1083
1084      default:
1085        /// All other instructions are unsupported. Scalarize them.
1086        scalarizeInstruction(Inst);
1087        break;
1088    }// end of switch.
1089  }// end of for_each instr.
1090
1091  // At this point every instruction in the original loop is widended to
1092  // a vector form. We are almost done. Now, we need to fix the PHI nodes
1093  // that we vectorized. The PHI nodes are currently empty because we did
1094  // not want to introduce cycles. Notice that the remaining PHI nodes
1095  // that we need to fix are reduction variables.
1096
1097  // Create the 'reduced' values for each of the induction vars.
1098  // The reduced values are the vector values that we scalarize and combine
1099  // after the loop is finished.
1100  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
1101       it != e; ++it) {
1102    PHINode *RdxPhi = *it;
1103    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
1104    assert(RdxPhi && "Unable to recover vectorized PHI");
1105
1106    // Find the reduction variable descriptor.
1107    assert(Legal->getReductionVars()->count(RdxPhi) &&
1108           "Unable to find the reduction variable");
1109    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
1110      (*Legal->getReductionVars())[RdxPhi];
1111
1112    // We need to generate a reduction vector from the incoming scalar.
1113    // To do so, we need to generate the 'identity' vector and overide
1114    // one of the elements with the incoming scalar reduction. We need
1115    // to do it in the vector-loop preheader.
1116    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
1117
1118    // This is the vector-clone of the value that leaves the loop.
1119    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
1120    Type *VecTy = VectorExit->getType();
1121
1122    // Find the reduction identity variable. Zero for addition, or, xor,
1123    // one for multiplication, -1 for And.
1124    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
1125                                          VecTy->getScalarType());
1126
1127    // This vector is the Identity vector where the first element is the
1128    // incoming scalar reduction.
1129    Value *VectorStart = Builder.CreateInsertElement(Identity,
1130                                                    RdxDesc.StartValue, Zero);
1131
1132
1133    // Fix the vector-loop phi.
1134    // We created the induction variable so we know that the
1135    // preheader is the first entry.
1136    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
1137
1138    // Reductions do not have to start at zero. They can start with
1139    // any loop invariant values.
1140    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
1141    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
1142    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
1143    VecRdxPhi->addIncoming(Val, LoopVectorBody);
1144
1145    // Before each round, move the insertion point right between
1146    // the PHIs and the values we are going to write.
1147    // This allows us to write both PHINodes and the extractelement
1148    // instructions.
1149    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
1150
1151    // This PHINode contains the vectorized reduction variable, or
1152    // the initial value vector, if we bypass the vector loop.
1153    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
1154    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
1155    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
1156
1157    // Extract the first scalar.
1158    Value *Scalar0 =
1159      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
1160    // Extract and reduce the remaining vector elements.
1161    for (unsigned i=1; i < VF; ++i) {
1162      Value *Scalar1 =
1163        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
1164      switch (RdxDesc.Kind) {
1165        case LoopVectorizationLegality::IntegerAdd:
1166          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
1167          break;
1168        case LoopVectorizationLegality::IntegerMult:
1169          Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
1170          break;
1171        case LoopVectorizationLegality::IntegerOr:
1172          Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
1173          break;
1174        case LoopVectorizationLegality::IntegerAnd:
1175          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
1176          break;
1177        case LoopVectorizationLegality::IntegerXor:
1178          Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
1179          break;
1180        default:
1181          llvm_unreachable("Unknown reduction operation");
1182      }
1183    }
1184
1185    // Now, we need to fix the users of the reduction variable
1186    // inside and outside of the scalar remainder loop.
1187    // We know that the loop is in LCSSA form. We need to update the
1188    // PHI nodes in the exit blocks.
1189    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
1190         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
1191      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
1192      if (!LCSSAPhi) continue;
1193
1194      // All PHINodes need to have a single entry edge, or two if
1195      // we already fixed them.
1196      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
1197
1198      // We found our reduction value exit-PHI. Update it with the
1199      // incoming bypass edge.
1200      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
1201        // Add an edge coming from the bypass.
1202        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
1203        break;
1204      }
1205    }// end of the LCSSA phi scan.
1206
1207    // Fix the scalar loop reduction variable with the incoming reduction sum
1208    // from the vector body and from the backedge value.
1209    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
1210    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
1211    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
1212    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
1213  }// end of for each redux variable.
1214}
1215
// Patch up the analyses (ScalarEvolution and the dominator tree) after the
// new vector/scalar CFG has been stitched in.
void SingleBlockLoopVectorizer::updateAnalysis() {
  // Drop any cached SCEV information for the original loop; its trip count
  // and expressions are stale after the rewrite.
  SE->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
         "Entry does not dominate exit.");

  // NOTE: the order of these updates matters — addNewBlock requires the
  // block's immediate dominator to already be present in the tree, so the
  // preheaders are inserted before the blocks they dominate.
  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
  // The middle block is reached whether or not the vector loop ran, so its
  // immediate dominator is the bypass block, not the vector body.
  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
  // The scalar body and the exit block existed before; only their
  // immediate dominators changed.
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Verify the tree in asserts builds only.
  DEBUG(DT->verifyAnalysis());
}
1233
1234bool LoopVectorizationLegality::canVectorize() {
1235  if (!TheLoop->getLoopPreheader()) {
1236    assert(false && "No preheader!!");
1237    DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
1238    return  false;
1239  }
1240
1241  // We can only vectorize single basic block loops.
1242  unsigned NumBlocks = TheLoop->getNumBlocks();
1243  if (NumBlocks != 1) {
1244    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
1245    return false;
1246  }
1247
1248  // We need to have a loop header.
1249  BasicBlock *BB = TheLoop->getHeader();
1250  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
1251
1252  // ScalarEvolution needs to be able to find the exit count.
1253  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
1254  if (ExitCount == SE->getCouldNotCompute()) {
1255    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
1256    return false;
1257  }
1258
1259  // Do not loop-vectorize loops with a tiny trip count.
1260  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
1261  if (TC > 0u && TC < TinyTripCountThreshold) {
1262    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
1263          "This loop is not worth vectorizing.\n");
1264    return false;
1265  }
1266
1267  // Go over each instruction and look at memory deps.
1268  if (!canVectorizeBlock(*BB)) {
1269    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
1270    return false;
1271  }
1272
1273  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
1274        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
1275        <<"!\n");
1276
1277  // Okay! We can vectorize. At this point we don't have any other mem analysis
1278  // which may limit our maximum vectorization factor, so just return true with
1279  // no restrictions.
1280  return true;
1281}
1282
1283bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
1284  // Scan the instructions in the block and look for hazards.
1285  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
1286    Instruction *I = it;
1287
1288    PHINode *Phi = dyn_cast<PHINode>(I);
1289    if (Phi) {
1290      // This should not happen because the loop should be normalized.
1291      if (Phi->getNumIncomingValues() != 2) {
1292        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
1293        return false;
1294      }
1295      // We only look at integer phi nodes.
1296      if (!Phi->getType()->isIntegerTy()) {
1297        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
1298        return false;
1299      }
1300
1301      if (isInductionVariable(Phi)) {
1302        if (Induction) {
1303          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
1304          return false;
1305        }
1306        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
1307        Induction = Phi;
1308        continue;
1309      }
1310      if (AddReductionVar(Phi, IntegerAdd)) {
1311        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
1312        continue;
1313      }
1314      if (AddReductionVar(Phi, IntegerMult)) {
1315        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
1316        continue;
1317      }
1318      if (AddReductionVar(Phi, IntegerOr)) {
1319        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
1320        continue;
1321      }
1322      if (AddReductionVar(Phi, IntegerAnd)) {
1323        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
1324        continue;
1325      }
1326      if (AddReductionVar(Phi, IntegerXor)) {
1327        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
1328        continue;
1329      }
1330
1331      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
1332      return false;
1333    }// end of PHI handling
1334
1335    // We still don't handle functions.
1336    CallInst *CI = dyn_cast<CallInst>(I);
1337    if (CI) {
1338      DEBUG(dbgs() << "LV: Found a call site.\n");
1339      return false;
1340    }
1341
1342    // We do not re-vectorize vectors.
1343    if (!VectorType::isValidElementType(I->getType()) &&
1344        !I->getType()->isVoidTy()) {
1345      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
1346      return false;
1347    }
1348
1349    // Reduction instructions are allowed to have exit users.
1350    // All other instructions must not have external users.
1351    if (!AllowedExit.count(I))
1352      //Check that all of the users of the loop are inside the BB.
1353      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
1354           it != e; ++it) {
1355        Instruction *U = cast<Instruction>(*it);
1356        // This user may be a reduction exit value.
1357        BasicBlock *Parent = U->getParent();
1358        if (Parent != &BB) {
1359          DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
1360          return false;
1361        }
1362    }
1363  } // next instr.
1364
1365  if (!Induction) {
1366      DEBUG(dbgs() << "LV: Did not find an induction var.\n");
1367      return false;
1368  }
1369
1370  // Don't vectorize if the memory dependencies do not allow vectorization.
1371  if (!canVectorizeMemory(BB))
1372    return false;
1373
1374  // We now know that the loop is vectorizable!
1375  // Collect variables that will remain uniform after vectorization.
1376  std::vector<Value*> Worklist;
1377
1378  // Start with the conditional branch and walk up the block.
1379  Worklist.push_back(BB.getTerminator()->getOperand(0));
1380
1381  while (Worklist.size()) {
1382    Instruction *I = dyn_cast<Instruction>(Worklist.back());
1383    Worklist.pop_back();
1384    // Look at instructions inside this block.
1385    if (!I) continue;
1386    if (I->getParent() != &BB) continue;
1387
1388    // Stop when reaching PHI nodes.
1389    if (isa<PHINode>(I)) {
1390      assert(I == Induction && "Found a uniform PHI that is not the induction");
1391      break;
1392    }
1393
1394    // This is a known uniform.
1395    Uniforms.insert(I);
1396
1397    // Insert all operands.
1398    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
1399      Worklist.push_back(I->getOperand(i));
1400    }
1401  }
1402
1403  return true;
1404}
1405
// Memory legality check: collects all loads/stores in the block, partitions
// their pointers into read-only and read-write sets, and proves either
// (a) no aliasing via identified underlying objects, or (b) that a runtime
// bounds check (PtrRtCheck) can disambiguate them. Returns true when
// vectorization is safe (possibly conditional on the runtime check).
bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;
  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;
  // Reset any state left over from a previous query.
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  // Scan the BB and collect legal loads and stores.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    // If this is a load, save it. If this instruction can read from memory
    // but is not a load, then we quit. Notice that we don't handle function
    // calls that read or write.
    if (I->mayReadFromMemory()) {
      LoadInst *Ld = dyn_cast<LoadInst>(I);
      if (!Ld) return false;
      // Non-simple means volatile or atomic; we can't reorder those.
      if (!Ld->isSimple()) {
        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
        return false;
      }
      Loads.push_back(Ld);
      continue;
    }

    // Save store instructions. Abort if other instructions write to memory.
    if (I->mayWriteToMemory()) {
      StoreInst *St = dyn_cast<StoreInst>(I);
      if (!St) return false;
      if (!St->isSimple()) {
        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
        return false;
      }
      Stores.push_back(St);
    }
  } // next instr.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
        DEBUG(dbgs() << "LV: Found a read-only loop!\n");
        return true;
  }

  // Holds the read and read-write *pointers* that we find.
  ValueVector Reads;
  ValueVector ReadWrites;

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = dyn_cast<StoreInst>(*I);
    assert(ST && "Bad StoreInst");
    Value* Ptr = ST->getPointerOperand();

    // Storing to the same address on every iteration is a cross-iteration
    // dependence we can't vectorize.
    if (isUniform(Ptr)) {
      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
      return false;
    }

    // If we did *not* see this pointer before, insert it to
    // the read-write list. At this phase it is only a 'write' list.
    // (insert returns true when Ptr was not already in the set.)
    if (Seen.insert(Ptr))
      ReadWrites.push_back(Ptr);
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = dyn_cast<LoadInst>(*I);
    assert(LD && "Bad LoadInst");
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x;  Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
      Reads.push_back(Ptr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then is it safe to vectorize.
  if (ReadWrites.size() == 1 && Reads.size() == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool RT = true;
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.Pointers.push_back(*I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.Pointers.push_back(*I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }

  // Check that we did not collect too many pointers or found a
  // unsizeable pointer.
  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.Pointers.clear();
    RT = false;
  }

  PtrRtCheck.Need = RT;

  if (RT) {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }

  // Now that the pointers are in two lists (Reads and ReadWrites), we
  // can check that there are no conflicts between each of the writes and
  // between the writes to the reads.
  // NOTE: from here on, 'return RT' means "safe only if the runtime check
  // is available"; the static analysis below could not prove independence.
  ValueSet WriteObjects;
  ValueVector TempObjects;

  // Check that the read-writes do not conflict with other read-write
  // pointers.
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
        return RT;
      }
      if (!WriteObjects.insert(*it)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  /// Check that the reads don't conflict with the read-writes.
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
        return RT;
      }
      if (WriteObjects.count(*it)) {
        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  // It is safe to vectorize and we don't need any runtime checks.
  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;
  return true;
}
1588
// Checks whether the PHI node 'Phi' is the header of a reduction cycle of
// kind 'Kind' (for example a running integer sum). On success the reduction
// is recorded in the 'Reductions' map, the single loop-exiting instruction
// is whitelisted in 'AllowedExit', and true is returned.
bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  // A reduction PHI must merge exactly two values: the start value and the
  // value that comes back around the loop.
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Find the possible incoming reduction variable.
  BasicBlock *BB = Phi->getParent();
  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction, variables (which form a cycle
  // which ends in the phi node).
  Instruction *ExitInstruction = 0;

  // Iter is our iterator. We start with the PHI node and scan for all of the
  // users of this instruction. All users must be instructions which can be
  // used as reduction variables (such as ADD). We may have a single
  // out-of-block user. The cycle must end with the original PHI.
  // Also, we can't have multiple block-local users.
  Instruction *Iter = Phi;
  // NOTE(review): if the chain cycles back to 'Phi' without ever producing
  // an out-of-block user, none of the 'return' statements below fires and
  // this loop does not terminate — confirm this case cannot reach here.
  while (true) {
    // Any reduction instr must be of one of the allowed kinds.
    if (!isReductionInstr(Iter, Kind))
      return false;

    // Did we find a user inside this block?
    bool FoundInBlockUser = false;
    // Did we reach the initial PHI node?
    bool FoundStartPHI = false;

    // If the instruction has no users then this is a broken
    // chain and can't be a reduction variable.
    if (Iter->use_empty())
      return false;

    // For each of the *users* of iter.
    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
         it != e; ++it) {
      Instruction *U = cast<Instruction>(*it);
      // We already know that the PHI is a user.
      if (U == Phi) {
        FoundStartPHI = true;
        continue;
      }
      // Check if we found the exit user.
      BasicBlock *Parent = U->getParent();
      if (Parent != BB) {
        // We must have a single exit instruction.
        if (ExitInstruction != 0)
          return false;
        ExitInstruction = Iter;
      }
      // We can't have multiple inside users.
      if (FoundInBlockUser)
        return false;
      FoundInBlockUser = true;
      Iter = U;
    }

    // We found a reduction var if we have reached the original
    // phi node and we only have a single instruction with out-of-loop
    // users.
   if (FoundStartPHI && ExitInstruction) {
     // This instruction is allowed to have out-of-loop users.
     AllowedExit.insert(ExitInstruction);

     // Save the description of this reduction variable.
     ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
     Reductions[Phi] = RD;
     return true;
   }
  }
}
1665
1666bool
1667LoopVectorizationLegality::isReductionInstr(Instruction *I,
1668                                            ReductionKind Kind) {
1669    switch (I->getOpcode()) {
1670    default:
1671      return false;
1672    case Instruction::PHI:
1673      // possibly.
1674      return true;
1675    case Instruction::Add:
1676    case Instruction::Sub:
1677      return Kind == IntegerAdd;
1678    case Instruction::Mul:
1679    case Instruction::UDiv:
1680    case Instruction::SDiv:
1681      return Kind == IntegerMult;
1682    case Instruction::And:
1683      return Kind == IntegerAnd;
1684    case Instruction::Or:
1685      return Kind == IntegerOr;
1686    case Instruction::Xor:
1687      return Kind == IntegerXor;
1688    }
1689}
1690
1691bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
1692  // Check that the PHI is consecutive and starts at zero.
1693  const SCEV *PhiScev = SE->getSCEV(Phi);
1694  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
1695  if (!AR) {
1696    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
1697    return false;
1698  }
1699  const SCEV *Step = AR->getStepRecurrence(*SE);
1700
1701  if (!Step->isOne()) {
1702    DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
1703    return false;
1704  }
1705  return true;
1706}
1707
1708bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
1709  const SCEV *PhiScev = SE->getSCEV(Ptr);
1710  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
1711  if (!AR)
1712    return false;
1713
1714  return AR->isAffine();
1715}
1716
1717unsigned
1718LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
1719  if (!VTTI) {
1720    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
1721    return 1;
1722  }
1723
1724  float Cost = expectedCost(1);
1725  unsigned Width = 1;
1726  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
1727  for (unsigned i=2; i <= VF; i*=2) {
1728    // Notice that the vector loop needs to be executed less times, so
1729    // we need to divide the cost of the vector loops by the width of
1730    // the vector elements.
1731    float VectorCost = expectedCost(i) / (float)i;
1732    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
1733          (int)VectorCost << ".\n");
1734    if (VectorCost < Cost) {
1735      Cost = VectorCost;
1736      Width = i;
1737    }
1738  }
1739
1740  DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
1741  return Width;
1742}
1743
1744unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
1745  // We can only estimate the cost of single basic block loops.
1746  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
1747
1748  BasicBlock *BB = TheLoop->getHeader();
1749  unsigned Cost = 0;
1750
1751  // For each instruction in the old loop.
1752  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
1753    Instruction *Inst = it;
1754    unsigned C = getInstructionCost(Inst, VF);
1755    Cost += C;
1756    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
1757          " For instruction: "<< *Inst << "\n");
1758  }
1759
1760  return Cost;
1761}
1762
// Returns the estimated execution cost of instruction 'I' when the loop is
// vectorized with factor 'VF'. Costs are queried from the target's vector
// target transform info (VTTI). Instructions that stay uniform after
// vectorization are costed as scalars (VF forced to 1).
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  assert(VTTI && "Invalid vector target transformation info");

  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  Type *RetTy = I->getType();
  Type *VectorTy = ToVectorTy(RetTy, VF);


  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
    case Instruction::GetElementPtr:
      // We mark this instruction as zero-cost because scalar GEPs are usually
      // lowered to the instruction addressing mode. At the moment we don't
      // generate vector geps.
      return 0;
    case Instruction::Br: {
      return VTTI->getCFInstrCost(I->getOpcode());
    }
    case Instruction::PHI:
      // PHI nodes are modeled as free.
      return 0;
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Arithmetic and logical operators are costed on the widened type.
      return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
    }
    case Instruction::Select: {
      SelectInst *SI = cast<SelectInst>(I);
      const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
      bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
      Type *CondTy = SI->getCondition()->getType();
      // A loop-invariant (scalar) condition is broadcast to a vector of
      // conditions, so cost the select with a vector condition type.
      if (ScalarCond)
        CondTy = VectorType::get(CondTy, VF);

      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Compares are costed on the compared operand type, not on the
      // boolean result type.
      Type *ValTy = I->getOperand(0)->getType();
      VectorTy = ToVectorTy(ValTy, VF);
      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
    }
    case Instruction::Store: {
      StoreInst *SI = cast<StoreInst>(I);
      Type *ValTy = SI->getValueOperand()->getType();
      VectorTy = ToVectorTy(ValTy, VF);

      // Scalar (VF == 1) stores keep their original cost.
      if (VF == 1)
        return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
                              SI->getAlignment(), SI->getPointerAddressSpace());

      // Scalarized stores.
      if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
        unsigned Cost = 0;
        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                              ValTy);
        // The cost of extracting from the value vector.
        Cost += VF * (ExtCost);
        // The cost of the scalar stores.
        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                           ValTy->getScalarType(),
                                           SI->getAlignment(),
                                           SI->getPointerAddressSpace());
        return Cost;
      }

      // Wide stores.
      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
                                   SI->getPointerAddressSpace());
    }
    case Instruction::Load: {
      LoadInst *LI = cast<LoadInst>(I);

      // Scalar (VF == 1) loads keep their original cost.
      if (VF == 1)
        return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
                                     LI->getAlignment(),
                                     LI->getPointerAddressSpace());

      // Scalarized loads: non-consecutive accesses become VF scalar loads
      // whose results are inserted back into a vector.
      if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
        unsigned Cost = 0;
        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
        // The cost of inserting the loaded value into the result vector.
        Cost += VF * (InCost);
        // The cost of the scalar loads.
        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                           RetTy->getScalarType(),
                                           LI->getAlignment(),
                                           LI->getPointerAddressSpace());
        return Cost;
      }

      // Wide loads.
      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
                                   LI->getPointerAddressSpace());
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      // Casts are costed on the widened source and destination types.
      Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
      return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
    }
    default: {
      // We are scalarizing the instruction. Return the cost of the scalar
      // instruction, plus the cost of insert and extract into vector
      // elements, times the vector width.
      unsigned Cost = 0;

      // Void-returning instructions produce no value to insert back.
      bool IsVoid = RetTy->isVoidTy();

      unsigned InsCost = (IsVoid ? 0 :
                          VTTI->getInstrCost(Instruction::InsertElement,
                                             VectorTy));

      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                            VectorTy);

      // The cost of inserting the results plus extracting each one of the
      // operands.
      Cost += VF * (InsCost + ExtCost * I->getNumOperands());

      // The cost of executing VF copies of the scalar instruction.
      Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
      return Cost;
    }
  }// end of switch.
}
1918
1919Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
1920  if (Scalar->isVoidTy() || VF == 1)
1921    return Scalar;
1922  return VectorType::get(Scalar, VF);
1923}
1924
1925} // namespace
1926
// Register the loop vectorizer pass and declare the analyses it depends on
// (alias analysis, scalar evolution, and loop simplification).
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
1934
namespace llvm {
  // Factory function for clients that want to add the loop vectorizer to a
  // pass pipeline.
  Pass *createLoopVectorizePass() {
    return new LoopVectorize();
  }
}
1940
1941