LoopVectorize.cpp revision 462d1ca42831df53a3c6435d247776f0b85bd594
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11// and generates target-independent LLVM-IR. Legalization of the IR is done
12// in the codegen. However, the vectorizer uses (will use) the codegen
13// interfaces to generate IR that is likely to result in an optimal binary.
14//
15// The loop vectorizer combines consecutive loop iterations into a single
16// 'wide' iteration. After this transformation the index is incremented
17// by the SIMD vector width, and not by one.
18//
19// This pass has four parts:
20// 1. The main loop pass that drives the different parts.
21// 2. LoopVectorizationLegality - A unit that checks for the legality
22//    of the vectorization.
23// 3. SingleBlockLoopVectorizer - A unit that performs the actual
24//    widening of instructions.
25// 4. LoopVectorizationCostModel - A unit that checks for the profitability
26//    of vectorization. It decides on the optimal vector width, which
27//    can be one, if vectorization is not profitable.
28//===----------------------------------------------------------------------===//
29//
30// The reduction-variable vectorization is based on the paper:
31//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
32//
33// Variable uniformity checks are inspired by:
34// Karrenberg, R. and Hack, S. Whole Function Vectorization.
35//
36// Other ideas/concepts are from:
37//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
38//
39//===----------------------------------------------------------------------===//
40#define LV_NAME "loop-vectorize"
41#define DEBUG_TYPE LV_NAME
42#include "llvm/Constants.h"
43#include "llvm/DerivedTypes.h"
44#include "llvm/Instructions.h"
45#include "llvm/LLVMContext.h"
46#include "llvm/Pass.h"
47#include "llvm/Analysis/LoopPass.h"
48#include "llvm/Value.h"
49#include "llvm/Function.h"
50#include "llvm/Analysis/Verifier.h"
51#include "llvm/Module.h"
52#include "llvm/Type.h"
53#include "llvm/ADT/SmallVector.h"
54#include "llvm/ADT/StringExtras.h"
55#include "llvm/Analysis/AliasAnalysis.h"
56#include "llvm/Analysis/AliasSetTracker.h"
57#include "llvm/Analysis/ScalarEvolution.h"
58#include "llvm/Analysis/Dominators.h"
59#include "llvm/Analysis/ScalarEvolutionExpressions.h"
60#include "llvm/Analysis/ScalarEvolutionExpander.h"
61#include "llvm/Analysis/LoopInfo.h"
62#include "llvm/Analysis/ValueTracking.h"
63#include "llvm/Transforms/Scalar.h"
64#include "llvm/Transforms/Utils/BasicBlockUtils.h"
65#include "llvm/TargetTransformInfo.h"
66#include "llvm/Support/CommandLine.h"
67#include "llvm/Support/Debug.h"
68#include "llvm/Support/raw_ostream.h"
69#include "llvm/DataLayout.h"
70#include "llvm/Transforms/Utils/Local.h"
71#include <algorithm>
72using namespace llvm;
73
74static cl::opt<unsigned>
75VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
76          cl::desc("Set the default vectorization width. Zero is autoselect."));
77
78namespace {
79
80// Forward declarations.
81class LoopVectorizationLegality;
82class LoopVectorizationCostModel;
83
84/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
85/// block to a specified vectorization factor (VF).
86/// This class performs the widening of scalars into vectors, or multiple
87/// scalars. This class also implements the following features:
88/// * It inserts an epilogue loop for handling loops that don't have iteration
89///   counts that are known to be a multiple of the vectorization factor.
90/// * It handles the code generation for reduction variables.
91/// * Scalarization (implementation using scalars) of un-vectorizable
92///   instructions.
93/// SingleBlockLoopVectorizer does not perform any vectorization-legality
94/// checks, and relies on the caller to check for the different legality
95/// aspects. The SingleBlockLoopVectorizer relies on the
96/// LoopVectorizationLegality class to provide information about the induction
97/// and reduction variables that were found for a given vectorization factor.
98class SingleBlockLoopVectorizer {
99public:
100  /// Ctor.
101  SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
102                            DominatorTree *dt, LPPassManager *Lpm,
103                            unsigned VecWidth):
104  OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
105  Builder(Se->getContext()), Induction(0), OldInduction(0) { }
106
107  // Perform the actual loop widening (vectorization).
108  void vectorize(LoopVectorizationLegality *Legal) {
109    /// Create a new empty loop. Unlink the old loop and connect the new one.
110    createEmptyLoop(Legal);
111    /// Widen each instruction in the old loop to a new one in the new loop.
112    /// Use the Legality module to find the induction and reduction variables.
113    vectorizeLoop(Legal);
114    // Register the new loop.
115    updateAnalysis();
116  }
117
118private:
119  /// Create an empty loop, based on the loop ranges of the old loop.
120  void createEmptyLoop(LoopVectorizationLegality *Legal);
121  /// Copy and widen the instructions from the old loop.
122  void vectorizeLoop(LoopVectorizationLegality *Legal);
123  /// Insert the new loop to the loop hierarchy and pass manager.
124  void updateAnalysis();
125
126  /// This instruction is un-vectorizable. Implement it as a sequence
127  /// of scalars.
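  /// For example, an unsupported instruction is cloned VF times, once per
  /// vector lane; if it produces a value, the scalar results are inserted
  /// back into a vector for later users.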
128  void scalarizeInstruction(Instruction *Instr);
129
130  /// Create a broadcast instruction. This method generates a broadcast
131  /// instruction (shuffle) for loop invariant values and for the induction
132  /// value. If this is the induction variable then we extend it to N, N+1, ...
133  /// this is needed because each iteration in the loop corresponds to a SIMD
134  /// element.
135  Value *getBroadcastInstrs(Value *V);
136
137  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2, ...
138  /// to the consecutive elements of the vector, starting from zero.
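  /// For example, with VF = 4 a broadcast of the induction value i becomes
  /// the vector <i, i+1, i+2, i+3>.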
139  Value *getConsecutiveVector(Value* Val);
140
141  /// When we go over instructions in the basic block we rely on previous
142  /// values within the current basic block or on loop invariant values.
143  /// When we widen (vectorize) values we place them in the map. If the values
144  /// are not within the map, they have to be loop invariant, so we simply
145  /// broadcast them into a vector.
146  Value *getVectorValue(Value *V);
147
148  /// Get a uniform vector of constant integers. We use this to get
149  /// vectors of ones and zeros for the reduction code.
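  /// For example, requesting the value 1 with a 32-bit integer type yields
  /// the VF-wide constant vector <1, 1, ..., 1>.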
150  Constant* getUniformVector(unsigned Val, Type* ScalarTy);
151
152  typedef DenseMap<Value*, Value*> ValueMap;
153
154  /// The original loop.
155  Loop *OrigLoop;
156  // Scev analysis to use.
157  ScalarEvolution *SE;
158  // Loop Info.
159  LoopInfo *LI;
160  // Dominator Tree.
161  DominatorTree *DT;
162  // Loop Pass Manager;
163  LPPassManager *LPM;
164  // The vectorization factor to use.
165  unsigned VF;
166
167  // The builder that we use
168  IRBuilder<> Builder;
169
170  // --- Vectorization state ---
171
172  /// The vector-loop preheader.
173  BasicBlock *LoopVectorPreHeader;
174  /// The scalar-loop preheader.
175  BasicBlock *LoopScalarPreHeader;
176  /// Middle Block between the vector and the scalar.
177  BasicBlock *LoopMiddleBlock;
178  /// The ExitBlock of the scalar loop.
179  BasicBlock *LoopExitBlock;
180  /// The vector loop body.
181  BasicBlock *LoopVectorBody;
182  /// The scalar loop body.
183  BasicBlock *LoopScalarBody;
184  /// The first bypass block.
185  BasicBlock *LoopBypassBlock;
186
187  /// The new Induction variable which was added to the new block.
188  PHINode *Induction;
189  /// The induction variable of the old basic block.
190  PHINode *OldInduction;
191  // Maps scalars to widened vectors.
192  ValueMap WidenMap;
193};
194
195/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
196/// to what vectorization factor.
197/// This class does not look at the profitability of vectorization, only the
198/// legality. This class has two main kinds of checks:
199/// * Memory checks - The code in canVectorizeMemory checks if vectorization
200///   will change the order of memory accesses in a way that will change the
201///   correctness of the program.
202/// * Scalar checks - The code in canVectorizeBlock checks for a number
203///   of different conditions, such as the availability of a single induction
204///   variable and that all types are supported and vectorizable.
205/// This code reflects the capabilities of SingleBlockLoopVectorizer.
206/// This class is also used by SingleBlockLoopVectorizer for identifying
207/// the induction variable and the different reduction variables.
208class LoopVectorizationLegality {
209public:
210  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
211  TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
212
213  /// This represents the kinds of reductions that we support.
214  enum ReductionKind {
215    NoReduction = -1, /// Not a reduction.
216    IntegerAdd  = 0,  /// Sum of numbers.
217    IntegerMult = 1,  /// Product of numbers.
218    IntegerOr   = 2,  /// Bitwise or logical OR of numbers.
219    IntegerAnd  = 3,  /// Bitwise or logical AND of numbers.
220    IntegerXor  = 4   /// Bitwise or logical XOR of numbers.
221  };
222
223  /// This POD struct holds information about reduction variables.
224  struct ReductionDescriptor {
225    // Default C'tor
226    ReductionDescriptor():
227    StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
228
229    // C'tor.
230    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
231    StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
232
233    // The starting value of the reduction.
234    // It does not have to be zero!
235    Value *StartValue;
236    // The instruction whose value is used outside the loop.
237    Instruction *LoopExitInstr;
238    // The kind of the reduction.
239    ReductionKind Kind;
240  };
241
242  /// ReductionList contains the reduction descriptors for all
243  /// of the reductions that were found in the loop.
244  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
245
246  /// Returns true if it is legal to vectorize this loop.
247  /// This does not mean that it is profitable to vectorize this
248  /// loop, only that it is legal to do so.
249  bool canVectorize();
250
251  /// Returns the Induction variable.
252  PHINode *getInduction() {return Induction;}
253
254  /// Returns the reduction variables found in the loop.
255  ReductionList *getReductionVars() { return &Reductions; }
256
257  /// Check if the pointer returned by this GEP is consecutive
258  /// when the index is vectorized. This happens when the last
259  /// index of the GEP is consecutive, like the induction variable.
260  /// This check allows us to vectorize A[idx] into a wide load/store.
261  bool isConsecutiveGep(Value *Ptr);
262
263  /// Returns true if this instruction will remain scalar after vectorization.
264  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
265
266private:
267  /// Check if a single basic block loop is vectorizable.
268  /// At this point we know that this is a loop with a constant trip count
269  /// and we only need to check individual instructions.
270  bool canVectorizeBlock(BasicBlock &BB);
271
272  /// When we vectorize loops we may change the order in which
273  /// we read and write from memory. This method checks if it is
274  /// legal to vectorize the code, considering only memory constraints.
275  /// Returns true if BB is vectorizable.
276  bool canVectorizeMemory(BasicBlock &BB);
277
278  /// Returns true if 'Phi' is the kind of reduction variable for type
279  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
280  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
281  /// Returns true if the instruction I can be a reduction variable of type
282  /// 'Kind'.
283  bool isReductionInstr(Instruction *I, ReductionKind Kind);
284  /// Returns true if 'Phi' is an induction variable.
285  bool isInductionVariable(PHINode *Phi);
286
287  /// The loop that we evaluate.
288  Loop *TheLoop;
289  /// Scev analysis.
290  ScalarEvolution *SE;
291  /// DataLayout analysis.
292  DataLayout *DL;
293
294  //  ---  vectorization state --- //
295
296  /// Holds the induction variable.
297  PHINode *Induction;
298  /// Holds the reduction variables.
299  ReductionList Reductions;
300  /// Allowed outside users. This holds the reduction
301  /// vars which can be accessed from outside the loop.
302  SmallPtrSet<Value*, 4> AllowedExit;
303  /// This set holds the variables which are known to be uniform after
304  /// vectorization.
305  SmallPtrSet<Instruction*, 4> Uniforms;
306};
307
308/// LoopVectorizationCostModel - estimates the expected speedups due to
309/// vectorization.
310/// In many cases vectorization is not profitable. This can happen because
311/// of a number of reasons. In this class we mainly attempt to predict
312/// the expected speedup/slowdowns due to the supported instruction set.
313/// We use the VectorTargetTransformInfo to query the different backends
314/// for the cost of different operations.
315class LoopVectorizationCostModel {
316public:
317  /// C'tor.
318  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
319                             LoopVectorizationLegality *Leg,
320                             const VectorTargetTransformInfo *Vtti):
321  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
322
323  /// Returns the most profitable vectorization factor for the loop that is
324  /// smaller or equal to the VF argument. This method checks every power
325  /// of two up to VF.
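  /// With the default argument of 8, the candidate widths are 1, 2, 4 and 8.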
326  unsigned findBestVectorizationFactor(unsigned VF = 8);
327
328private:
329  /// Returns the expected execution cost. The unit of the cost does
330  /// not matter because we use the 'cost' units to compare different
331  /// vector widths. The cost that is returned is *not* normalized by
332  /// the vectorization factor.
333  unsigned expectedCost(unsigned VF);
334
335  /// Returns the execution time cost of an instruction for a given vector
336  /// width. Vector width of one means scalar.
337  unsigned getInstructionCost(Instruction *I, unsigned VF);
338
339  /// A helper function for converting Scalar types to vector types.
340  /// If the incoming type is void, we return void. If the VF is 1, we return
341  /// the scalar type.
342  static Type* ToVectorTy(Type *Scalar, unsigned VF);
343
344  /// The loop that we evaluate.
345  Loop *TheLoop;
346  /// Scev analysis.
347  ScalarEvolution *SE;
348
349  /// Vectorization legality.
350  LoopVectorizationLegality *Legal;
351  /// Vector target information.
352  const VectorTargetTransformInfo *VTTI;
353};
354
355struct LoopVectorize : public LoopPass {
356  static char ID; // Pass identification, replacement for typeid
357
358  LoopVectorize() : LoopPass(ID) {
359    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
360  }
361
362  ScalarEvolution *SE;
363  DataLayout *DL;
364  LoopInfo *LI;
365  TargetTransformInfo *TTI;
366  DominatorTree *DT;
367
368  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
369    // We only vectorize innermost loops.
370    if (!L->empty())
371      return false;
372
373    SE = &getAnalysis<ScalarEvolution>();
374    DL = getAnalysisIfAvailable<DataLayout>();
375    LI = &getAnalysis<LoopInfo>();
376    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
377    DT = &getAnalysis<DominatorTree>();
378
379    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
380          L->getHeader()->getParent()->getName() << "\"\n");
381
382    // Check if it is legal to vectorize the loop.
383    LoopVectorizationLegality LVL(L, SE, DL);
384    if (!LVL.canVectorize()) {
385      DEBUG(dbgs() << "LV: Not vectorizing.\n");
386      return false;
387    }
388
389    // Select the preferred vectorization factor.
390    unsigned VF = 1;
391    if (VectorizationFactor == 0) {
392      const VectorTargetTransformInfo *VTTI = 0;
393      if (TTI)
394        VTTI = TTI->getVectorTargetTransformInfo();
395      // Use the cost model.
396      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
397      VF = CM.findBestVectorizationFactor();
398
399      if (VF == 1) {
400        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
401        return false;
402      }
403
404    } else {
405      // Use the user command flag.
406      VF = VectorizationFactor;
407    }
408
409    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
410          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
411          "\n");
412
413    // If we decided that it is *legal* to vectorize the loop, then do it.
414    SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
415    LB.vectorize(&LVL);
416
417    DEBUG(verifyFunction(*L->getHeader()->getParent()));
418    return true;
419  }
420
421  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
422    LoopPass::getAnalysisUsage(AU);
423    AU.addRequiredID(LoopSimplifyID);
424    AU.addRequiredID(LCSSAID);
425    AU.addRequired<LoopInfo>();
426    AU.addRequired<ScalarEvolution>();
427    AU.addRequired<DominatorTree>();
428    AU.addPreserved<LoopInfo>();
429    AU.addPreserved<DominatorTree>();
430  }
431
432};
433
434Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
435  // Instructions that access the old induction variable
436  // actually want to get the new one.
437  if (V == OldInduction)
438    V = Induction;
439  // Create the types.
440  LLVMContext &C = V->getContext();
441  Type *VTy = VectorType::get(V->getType(), VF);
442  Type *I32 = IntegerType::getInt32Ty(C);
443  Constant *Zero = ConstantInt::get(I32, 0);
444  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
445  Value *UndefVal = UndefValue::get(VTy);
446  // Insert the value into a new vector.
447  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
448  // Broadcast the scalar into all locations in the vector.
449  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
450                                             "broadcast");
451  // We are accessing the induction variable. Make sure to promote the
452  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
453  if (V == Induction)
454    return getConsecutiveVector(Shuf);
455  return Shuf;
456}
457
458Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
459  assert(Val->getType()->isVectorTy() && "Must be a vector");
460  assert(Val->getType()->getScalarType()->isIntegerTy() &&
461         "Elem must be an integer");
462  // Create the types.
463  Type *ITy = Val->getType()->getScalarType();
464  VectorType *Ty = cast<VectorType>(Val->getType());
465  unsigned VLen = Ty->getNumElements();
466  SmallVector<Constant*, 8> Indices;
467
468  // Create a vector of consecutive numbers from zero to VLen-1.
469  for (unsigned i = 0; i < VLen; ++i)
470    Indices.push_back(ConstantInt::get(ITy, i));
471
472  // Add the consecutive indices to the vector value.
473  Constant *Cv = ConstantVector::get(Indices);
474  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
475  return Builder.CreateAdd(Val, Cv, "induction");
476}
477
478bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
479  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
480  if (!Gep)
481    return false;
482
483  unsigned NumOperands = Gep->getNumOperands();
484  Value *LastIndex = Gep->getOperand(NumOperands - 1);
485
486  // Check that all of the gep indices are uniform except for the last.
487  for (unsigned i = 0; i < NumOperands - 1; ++i)
488    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
489      return false;
490
491  // We can emit wide load/stores only if the last index is the induction
492  // variable.
493  const SCEV *Last = SE->getSCEV(LastIndex);
494  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
495    const SCEV *Step = AR->getStepRecurrence(*SE);
496
497    // The memory is consecutive because the last index is consecutive
498    // and all other indices are loop invariant.
499    if (Step->isOne())
500      return true;
501  }
502
503  return false;
504}
505
506Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
507  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
508  // If we saved a vectorized copy of V, use it.
509  Value *&MapEntry = WidenMap[V];
510  if (MapEntry)
511    return MapEntry;
512
513  // Broadcast V and save the value for future uses.
514  Value *B = getBroadcastInstrs(V);
515  MapEntry = B;
516  return B;
517}
518
519Constant*
520SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
521  SmallVector<Constant*, 8> Indices;
522  // Create a vector with VF copies of the scalar value.
523  for (unsigned i = 0; i < VF; ++i)
524    Indices.push_back(ConstantInt::get(ScalarTy, Val, true));
525
526  // Build the uniform constant vector.
527  return ConstantVector::get(Indices);
528}
529
530void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
531  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
532  // Holds vector parameters or scalars, in case of uniform vals.
533  SmallVector<Value*, 8> Params;
534
535  // Find all of the vectorized parameters.
536  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
537    Value *SrcOp = Instr->getOperand(op);
538
539    // If we are accessing the old induction variable, use the new one.
540    if (SrcOp == OldInduction) {
541      Params.push_back(getBroadcastInstrs(Induction));
542      continue;
543    }
544
545    // Try using previously calculated values.
546    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
547
548    // If the src is an instruction that appeared earlier in the basic block
549    // then it should already be vectorized.
550    if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
551      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
552      // The parameter is a vector value from earlier.
553      Params.push_back(WidenMap[SrcInst]);
554    } else {
555      // The parameter is a scalar from outside the loop. Maybe even a constant.
556      Params.push_back(SrcOp);
557    }
558  }
559
560  assert(Params.size() == Instr->getNumOperands() &&
561         "Invalid number of operands");
562
563  // Does this instruction return a value?
564  bool IsVoidRetTy = Instr->getType()->isVoidTy();
565  Value *VecResults = 0;
566
567  // If we have a return value, create an empty vector. We place the scalarized
568  // instructions in this vector.
569  if (!IsVoidRetTy)
570    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
571
572  // For each scalar that we create:
573  for (unsigned i = 0; i < VF; ++i) {
574    Instruction *Cloned = Instr->clone();
575    if (!IsVoidRetTy)
576      Cloned->setName(Instr->getName() + ".cloned");
577    // Replace the operands of the cloned instructions with extracted scalars.
578    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
579      Value *Op = Params[op];
580      // Param is a vector. Need to extract the right lane.
581      if (Op->getType()->isVectorTy())
582        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
583      Cloned->setOperand(op, Op);
584    }
585
586    // Place the cloned scalar in the new loop.
587    Builder.Insert(Cloned);
588
589    // If the original scalar returns a value we need to place it in a vector
590    // so that future users will be able to use it.
591    if (!IsVoidRetTy)
592      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
593                                               Builder.getInt32(i));
594  }
595
596  if (!IsVoidRetTy)
597    WidenMap[Instr] = VecResults;
598}
599
600void
601SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
602  /*
603   In this function we generate a new loop. The new loop will contain
604   the vectorized instructions while the old loop will continue to run the
605   scalar remainder.
606
607    [ ] <-- vector loop bypass.
608  /  |
609 /   v
610|   [ ]     <-- vector pre header.
611|    |
612|    v
613|   [  ] \
614|   [  ]_|   <-- vector loop.
615|    |
616 \   v
617   >[ ]   <--- middle-block.
618  /  |
619 /   v
620|   [ ]     <--- new preheader.
621|    |
622|    v
623|   [ ] \
624|   [ ]_|   <-- old scalar loop to handle remainder.
625 \   |
626  \  v
627   >[ ]     <-- exit block.
628   ...
629   */
630
631  // This is the original scalar-loop preheader.
632  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
633  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
634  assert(ExitBlock && "Must have an exit block");
635
636  // The loop index does not have to start at Zero. It starts with this value.
637  OldInduction = Legal->getInduction();
638  Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
639
640  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
641  assert(BypassBlock && "Invalid loop structure");
642
643  BasicBlock *VectorPH =
644      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
645  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
646                                                 "vector.body");
647
648  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
649                                                  "middle.block");
650  BasicBlock *ScalarPH =
651    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
652                                 "scalar.preheader");
653  // Find the induction variable.
654  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
655  assert(OldInduction && "We must have a single phi node.");
656  Type *IdxTy = OldInduction->getType();
657
658  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
659  // inside the loop.
660  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
661
662  // Generate the induction variable.
663  Induction = Builder.CreatePHI(IdxTy, 2, "index");
664  Constant *Step = ConstantInt::get(IdxTy, VF);
665
666  // Find the loop boundaries.
667  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
668  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
669
670  // Get the total trip count from the count by adding 1.
671  ExitCount = SE->getAddExpr(ExitCount,
672                             SE->getConstant(ExitCount->getType(), 1));
673
674  // Expand the trip count and place the new instructions in the preheader.
675  // Notice that the pre-header does not change, only the loop body.
676  SCEVExpander Exp(*SE, "induction");
677  Instruction *Loc = BypassBlock->getTerminator();
678
679  // We may need to extend the index in case there is a type mismatch.
680  // We know that the count starts at zero and does not overflow.
681  // We are using Zext because it should be less expensive.
682  if (ExitCount->getType() != Induction->getType())
683    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
684
685  // Count holds the overall loop count (N).
686  Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
687
688  // Add the start index to the loop count to get the new end index.
689  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
690
691  // Now we need to generate the expression for N - (N % VF), which is
692  // the part that the vectorized body will execute.
693  Constant *CIVF = ConstantInt::get(IdxTy, VF);
694  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
695  Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
696  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
697                                                     "end.idx.rnd.down", Loc);
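  // For example, with a total trip count of 10 and VF = 4, n.mod.vf is 2 and
  // n.vec is 8: the vector body covers the first 8 iterations and the scalar
  // remainder loop handles the last 2.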
698
699  // Now, compare the new count to zero. If it is zero, jump to the scalar part.
700  Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
701                               IdxEndRoundDown,
702                               StartIdx,
703                               "cmp.zero", Loc);
704  BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
705  // Remove the old terminator.
706  Loc->eraseFromParent();
707
708  // Add a check in the middle block to see if we have completed
709  // all of the iterations in the first vector loop.
710  // If (N - N%VF) == N, then we *don't* need to run the remainder.
711  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
712                                IdxEndRoundDown, "cmp.n",
713                                MiddleBlock->getTerminator());
714
715  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
716  // Remove the old terminator.
717  MiddleBlock->getTerminator()->eraseFromParent();
718
719  // Create i+1 and fill the PHINode.
720  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
721  Induction->addIncoming(StartIdx, VectorPH);
722  Induction->addIncoming(NextIdx, VecBody);
723  // Create the compare.
724  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
725  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
726
727  // Now we have two terminators. Remove the old one from the block.
728  VecBody->getTerminator()->eraseFromParent();
729
730  // Fix the scalar body iteration count.
731  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
732  OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown);
733
734  // Get ready to start creating new instructions into the vectorized body.
735  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
736
737  // Register the new loop.
738  Loop* Lp = new Loop();
739  LPM->insertLoop(Lp, OrigLoop->getParentLoop());
740
741  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
742
743  Loop *ParentLoop = OrigLoop->getParentLoop();
744  if (ParentLoop) {
745    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
746    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
747    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
748  }
749
750  // Save the state.
751  LoopVectorPreHeader = VectorPH;
752  LoopScalarPreHeader = ScalarPH;
753  LoopMiddleBlock = MiddleBlock;
754  LoopExitBlock = ExitBlock;
755  LoopVectorBody = VecBody;
756  LoopScalarBody = OldBasicBlock;
757  LoopBypassBlock = BypassBlock;
758}
759
760/// This function returns the identity element (or neutral element) for
761/// the operation K.
762static unsigned
763getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
764  switch (K) {
765  case LoopVectorizationLegality::IntegerXor:
766  case LoopVectorizationLegality::IntegerAdd:
767  case LoopVectorizationLegality::IntegerOr:
768    // Adding, Xoring, Oring zero to a number does not change it.
769    return 0;
770  case LoopVectorizationLegality::IntegerMult:
771    // Multiplying a number by 1 does not change it.
772    return 1;
773  case LoopVectorizationLegality::IntegerAnd:
774    // AND-ing a number with an all-1 value does not change it.
775    return -1;
776  default:
777    llvm_unreachable("Unknown reduction kind");
778  }
779}
780
781void
782SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
783  //===------------------------------------------------===//
784  //
785  // Notice: any optimization or new instruction that goes
786  // into the code below should also be implemented in
787  // the cost-model.
788  //
789  //===------------------------------------------------===//
790  typedef SmallVector<PHINode*, 4> PhiVector;
791  BasicBlock &BB = *OrigLoop->getHeader();
792  Constant *Zero = ConstantInt::get(
793    IntegerType::getInt32Ty(BB.getContext()), 0);
794
795  // In order to support reduction variables we need to be able to vectorize
796  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
797  // stages. First, we create a new vector PHI node with no incoming edges.
798  // We use this value when we vectorize all of the instructions that use the
799  // PHI. Next, after all of the instructions in the block are complete we
800  // add the new incoming edges to the PHI. At this point all of the
801  // instructions in the basic block are vectorized, so we can use them to
802  // construct the PHI.
803  PhiVector PHIsToFix;
804
805  // For each instruction in the old loop.
806  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
807    Instruction *Inst = it;
808
809    switch (Inst->getOpcode()) {
810      case Instruction::Br:
811        // Nothing to do for PHIs and BR, since we already took care of the
812        // loop control flow instructions.
813        continue;
814      case Instruction::PHI:{
815        PHINode* P = cast<PHINode>(Inst);
816        // Special handling for the induction var.
817        if (OldInduction == Inst)
818          continue;
819        // This is phase one of vectorizing PHIs.
820        // This has to be a reduction variable.
821        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
822        Type *VecTy = VectorType::get(Inst->getType(), VF);
823        WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
824        PHIsToFix.push_back(P);
825        continue;
826      }
827      case Instruction::Add:
828      case Instruction::FAdd:
829      case Instruction::Sub:
830      case Instruction::FSub:
831      case Instruction::Mul:
832      case Instruction::FMul:
833      case Instruction::UDiv:
834      case Instruction::SDiv:
835      case Instruction::FDiv:
836      case Instruction::URem:
837      case Instruction::SRem:
838      case Instruction::FRem:
839      case Instruction::Shl:
840      case Instruction::LShr:
841      case Instruction::AShr:
842      case Instruction::And:
843      case Instruction::Or:
844      case Instruction::Xor: {
845        // Just widen binops.
846        BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
847        Value *A = getVectorValue(Inst->getOperand(0));
848        Value *B = getVectorValue(Inst->getOperand(1));
849        // Use this vector value for all users of the original instruction.
850        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
851        break;
852      }
853      case Instruction::Select: {
854        // Widen selects.
855        // If the selector is loop invariant we can create a select
856        // instruction with a scalar condition. Otherwise, use vector-select.
857        Value *Cond = Inst->getOperand(0);
858        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
859
860        // The condition can be loop invariant but still defined inside the
861        // loop. This means that we can't just use the original 'cond' value.
862        // We have to take the 'vectorized' value and pick the first lane.
863        // Instcombine will make this a no-op.
864        Cond = getVectorValue(Cond);
865        if (InvariantCond)
866          Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
867
868        Value *Op0 = getVectorValue(Inst->getOperand(1));
869        Value *Op1 = getVectorValue(Inst->getOperand(2));
870        WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
871        break;
872      }
873
874      case Instruction::ICmp:
875      case Instruction::FCmp: {
876        // Widen compares. Generate vector compares.
877        bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
878        CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
879        Value *A = getVectorValue(Inst->getOperand(0));
880        Value *B = getVectorValue(Inst->getOperand(1));
881        if (FCmp)
882          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
883        else
884          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
885        break;
886      }
887
888      case Instruction::Store: {
889        // Attempt to issue a wide store.
890        StoreInst *SI = dyn_cast<StoreInst>(Inst);
891        Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
892        Value *Ptr = SI->getPointerOperand();
893        unsigned Alignment = SI->getAlignment();
894        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
895        // If the pointer is not a consecutive GEP, scalarize the store.
896        if (!Legal->isConsecutiveGep(Gep)) {
897          scalarizeInstruction(Inst);
898          break;
899        }
900
901        // The last index does not have to be the induction. It can be
902        // consecutive and be a function of the index. For example A[I+1];
903        unsigned NumOperands = Gep->getNumOperands();
904        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
905        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
906
907        // Create the new GEP with the new induction variable.
908        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
909        Gep2->setOperand(NumOperands - 1, LastIndex);
910        Ptr = Builder.Insert(Gep2);
911        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
912        Value *Val = getVectorValue(SI->getValueOperand());
913        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
914        break;
915      }
916      case Instruction::Load: {
917        // Attempt to issue a wide load.
918        LoadInst *LI = dyn_cast<LoadInst>(Inst);
919        Type *RetTy = VectorType::get(LI->getType(), VF);
920        Value *Ptr = LI->getPointerOperand();
921        unsigned Alignment = LI->getAlignment();
922        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
923
924        // If the pointer is not a consecutive GEP, scalarize the load.
925        if (!Legal->isConsecutiveGep(Gep)) {
926          scalarizeInstruction(Inst);
927          break;
928        }
929
930        // The last index does not have to be the induction. It can be
931        // consecutive and be a function of the index. For example A[I+1];
932        unsigned NumOperands = Gep->getNumOperands();
933        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
934        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
935
936        // Create the new GEP with the new induction variable.
937        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
938        Gep2->setOperand(NumOperands - 1, LastIndex);
939        Ptr = Builder.Insert(Gep2);
940        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
941        LI = Builder.CreateLoad(Ptr);
942        LI->setAlignment(Alignment);
943        // Use this vector value for all users of the load.
944        WidenMap[Inst] = LI;
945        break;
946      }
947      case Instruction::ZExt:
948      case Instruction::SExt:
949      case Instruction::FPToUI:
950      case Instruction::FPToSI:
951      case Instruction::FPExt:
952      case Instruction::PtrToInt:
953      case Instruction::IntToPtr:
954      case Instruction::SIToFP:
955      case Instruction::UIToFP:
956      case Instruction::Trunc:
957      case Instruction::FPTrunc:
958      case Instruction::BitCast: {
959        // Vectorize casts.
960        CastInst *CI = dyn_cast<CastInst>(Inst);
961        Value *A = getVectorValue(Inst->getOperand(0));
962        Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
963        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
964        break;
965      }
966
967      default:
968        // All other instructions are unsupported. Scalarize them.
969        scalarizeInstruction(Inst);
970        break;
971    }// end of switch.
972  }// end of for_each instr.
973
974  // At this point every instruction in the original loop is widened to
975  // a vector form. We are almost done. Now, we need to fix the PHI nodes
976  // that we vectorized. The PHI nodes are currently empty because we did
977  // not want to introduce cycles. Notice that the remaining PHI nodes
978  // that we need to fix are reduction variables.
979
980  // Create the 'reduced' values for each of the reduction vars.
981  // The reduced values are the vector values that we scalarize and combine
982  // after the loop is finished.
983  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
984       it != e; ++it) {
985    PHINode *RdxPhi = *it;
986    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
987    assert(RdxPhi && "Unable to recover vectorized PHI");
988
989    // Find the reduction variable descriptor.
990    assert(Legal->getReductionVars()->count(RdxPhi) &&
991           "Unable to find the reduction variable");
992    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
993      (*Legal->getReductionVars())[RdxPhi];
994
995    // We need to generate a reduction vector from the incoming scalar.
996    // To do so, we need to generate the 'identity' vector and override
997    // one of the elements with the incoming scalar reduction. We need
998    // to do it in the vector-loop preheader.
999    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
1000
1001    // This is the vector-clone of the value that leaves the loop.
1002    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
1003    Type *VecTy = VectorExit->getType();
1004
1005    // Find the reduction identity variable. Zero for addition, or, xor,
1006    // one for multiplication, -1 for And.
1007    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
1008                                          VecTy->getScalarType());
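    // For example, an IntegerAdd reduction uses <0, 0, ..., 0> as its identity
    // vector, while an IntegerMult reduction uses <1, 1, ..., 1>.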
1009
1010    // This vector is the Identity vector where the first element is the
1011    // incoming scalar reduction.
1012    Value *VectorStart = Builder.CreateInsertElement(Identity,
1013                                                    RdxDesc.StartValue, Zero);
1014
1015
1016    // Fix the vector-loop phi.
1017    // We created the induction variable so we know that the
1018    // preheader is the first entry.
1019    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
1020
1021    // Reductions do not have to start at zero. They can start with
1022    // any loop invariant values.
1023    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
1024    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
1025    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
1026    VecRdxPhi->addIncoming(Val, LoopVectorBody);
1027
1028    // Before each round, move the insertion point right between
1029    // the PHIs and the values we are going to write.
1030    // This allows us to write both PHINodes and the extractelement
1031    // instructions.
1032    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
1033
1034    // This PHINode contains the vectorized reduction variable, or
1035    // the initial value vector, if we bypass the vector loop.
1036    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
1037    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
1038    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
1039
1040    // Extract the first scalar.
1041    Value *Scalar0 =
1042      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
1043    // Extract and reduce the remaining vector elements.
1044    for (unsigned i=1; i < VF; ++i) {
1045      Value *Scalar1 =
1046        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
1047      switch (RdxDesc.Kind) {
1048        case LoopVectorizationLegality::IntegerAdd:
1049          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
1050          break;
1051        case LoopVectorizationLegality::IntegerMult:
1052          Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
1053          break;
1054        case LoopVectorizationLegality::IntegerOr:
1055          Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
1056          break;
1057        case LoopVectorizationLegality::IntegerAnd:
1058          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
1059          break;
1060        case LoopVectorizationLegality::IntegerXor:
1061          Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
1062          break;
1063        default:
1064          llvm_unreachable("Unknown reduction operation");
1065      }
1066    }
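    // For example, an add reduction of a <4 x i32> value is combined as
    // lane0 + lane1 + lane2 + lane3 using a chain of scalar adds.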
1067
1068    // Now, we need to fix the users of the reduction variable
1069    // inside and outside of the scalar remainder loop.
1070    // We know that the loop is in LCSSA form. We need to update the
1071    // PHI nodes in the exit blocks.
1072    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
1073         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
1074      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
1075      if (!LCSSAPhi) continue;
1076
1077      // All PHINodes need to have a single entry edge, or two if
1078      // we already fixed them.
1079      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
1080
1081      // We found our reduction value exit-PHI. Update it with the
1082      // incoming bypass edge.
1083      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
1084        // Add an edge coming from the bypass.
1085        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
1086        break;
1087      }
1088    }// end of the LCSSA phi scan.
1089
1090    // Fix the scalar loop reduction variable with the incoming reduction sum
1091    // from the vector body and from the backedge value.
1092    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
1093    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
1094    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
1095    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
1096  }// end of for each redux variable.
1097}
1098
1099void SingleBlockLoopVectorizer::updateAnalysis() {
1100  // The original basic block.
1101  SE->forgetLoop(OrigLoop);
1102
1103  // Update the dominator tree information.
1104  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
1105         "Entry does not dominate exit.");
1106
1107  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
1108  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
1109  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
1110  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
1111  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
1112  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
1113
1114  DEBUG(DT->verifyAnalysis());
1115}
1116
1117bool LoopVectorizationLegality::canVectorize() {
1118  if (!TheLoop->getLoopPreheader()) {
1119    assert(false && "No preheader!!");
1120    DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
1121    return false;
1122  }
1123
1124  // We can only vectorize single basic block loops.
1125  unsigned NumBlocks = TheLoop->getNumBlocks();
1126  if (NumBlocks != 1) {
1127    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
1128    return false;
1129  }
1130
1131  // We need to have a loop header.
1132  BasicBlock *BB = TheLoop->getHeader();
1133  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
1134
1135  // Go over each instruction and look at memory deps.
1136  if (!canVectorizeBlock(*BB)) {
1137    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
1138    return false;
1139  }
1140
1141  // ScalarEvolution needs to be able to find the exit count.
1142  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
1143  if (ExitCount == SE->getCouldNotCompute()) {
1144    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
1145    return false;
1146  }
1147
1148  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
1149
1150  // Okay! We can vectorize. At this point we don't have any other mem analysis
1151  // which may limit our maximum vectorization factor, so just return true with
1152  // no restrictions.
1153  return true;
1154}
1155
1156bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
1157  // Scan the instructions in the block and look for hazards.
1158  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
1159    Instruction *I = it;
1160
1161    PHINode *Phi = dyn_cast<PHINode>(I);
1162    if (Phi) {
1163      // This should not happen because the loop should be normalized.
1164      if (Phi->getNumIncomingValues() != 2) {
1165        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
1166        return false;
1167      }
1168      // We only look at integer phi nodes.
1169      if (!Phi->getType()->isIntegerTy()) {
1170        DEBUG(dbgs() << "LV: Found a non-int PHI.\n");
1171        return false;
1172      }
1173
1174      if (isInductionVariable(Phi)) {
1175        if (Induction) {
1176          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
1177          return false;
1178        }
1179        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
1180        Induction = Phi;
1181        continue;
1182      }
1183      if (AddReductionVar(Phi, IntegerAdd)) {
1184        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
1185        continue;
1186      }
1187      if (AddReductionVar(Phi, IntegerMult)) {
1188        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
1189        continue;
1190      }
1191      if (AddReductionVar(Phi, IntegerOr)) {
1192        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
1193        continue;
1194      }
1195      if (AddReductionVar(Phi, IntegerAnd)) {
1196        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
1197        continue;
1198      }
1199      if (AddReductionVar(Phi, IntegerXor)) {
1200        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
1201        continue;
1202      }
1203
1204      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
1205      return false;
1206    }// end of PHI handling
1207
1208    // We still don't handle function calls.
1209    CallInst *CI = dyn_cast<CallInst>(I);
1210    if (CI) {
1211      DEBUG(dbgs() << "LV: Found a call site.\n");
1212      return false;
1213    }
1214
1215    // We do not re-vectorize vectors.
1216    if (!VectorType::isValidElementType(I->getType()) &&
1217        !I->getType()->isVoidTy()) {
1218      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
1219      return false;
1220    }
1221
1222    // Reduction instructions are allowed to have exit users.
1223    // All other instructions must not have external users.
1224    if (!AllowedExit.count(I))
1225      // Check that all of the users of the instruction are inside the BB.
1226      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
1227           it != e; ++it) {
1228        Instruction *U = cast<Instruction>(*it);
1229        // This user may be a reduction exit value.
1230        BasicBlock *Parent = U->getParent();
1231        if (Parent != &BB) {
1232          DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
1233          return false;
1234        }
1235    }
1236  } // next instr.
1237
1238  if (!Induction) {
1239    DEBUG(dbgs() << "LV: Did not find an induction var.\n");
1240    return false;
1241  }
1242
1243  // Don't vectorize if the memory dependencies do not allow vectorization.
1244  if (!canVectorizeMemory(BB))
1245    return false;
1246
1247  // We now know that the loop is vectorizable!
1248  // Collect variables that will remain uniform after vectorization.
1249  std::vector<Value*> Worklist;
1250
1251  // Start with the conditional branch and walk up the block.
1252  Worklist.push_back(BB.getTerminator()->getOperand(0));
1253
1254  while (Worklist.size()) {
1255    Instruction *I = dyn_cast<Instruction>(Worklist.back());
1256    Worklist.pop_back();
1257    // Look at instructions inside this block.
1258    if (!I) continue;
1259    if (I->getParent() != &BB) continue;
1260
1261    // Stop when reaching PHI nodes.
1262    if (isa<PHINode>(I)) {
1263      assert(I == Induction && "Found a uniform PHI that is not the induction");
1264      break;
1265    }
1266
1267    // This is a known uniform.
1268    Uniforms.insert(I);
1269
1270    // Insert all operands.
1271    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
1272      Worklist.push_back(I->getOperand(i));
1273    }
1274  }
1275
1276  return true;
1277}
1278
1279bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
1280  typedef SmallVector<Value*, 16> ValueVector;
1281  typedef SmallPtrSet<Value*, 16> ValueSet;
1282  // Holds the Load and Store *instructions*.
1283  ValueVector Loads;
1284  ValueVector Stores;
1285
1286  // Scan the BB and collect legal loads and stores.
1287  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
1288    Instruction *I = it;
1289
1290    // If this is a load, save it. If this instruction can read from memory
1291    // but is not a load, then we quit. Notice that we don't handle function
1292    // calls that read or write.
1293    if (I->mayReadFromMemory()) {
1294      LoadInst *Ld = dyn_cast<LoadInst>(I);
1295      if (!Ld) return false;
1296      if (!Ld->isSimple()) {
1297        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
1298        return false;
1299      }
1300      Loads.push_back(Ld);
1301      continue;
1302    }
1303
1304    // Save store instructions. Abort if other instructions write to memory.
1305    if (I->mayWriteToMemory()) {
1306      StoreInst *St = dyn_cast<StoreInst>(I);
1307      if (!St) return false;
1308      if (!St->isSimple()) {
1309        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
1310        return false;
1311      }
1312      Stores.push_back(St);
1313    }
1314  } // next instr.
1315
1316  // Now we have two lists that hold the loads and the stores.
1317  // Next, we find the pointers that they use.
1318
1319  // Check if we see any stores. If there are no stores, then we don't
1320  // care if the pointers are *restrict*.
1321  if (!Stores.size()) {
1322    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
1323    return true;
1324  }
1325
1326  // Holds the read and read-write *pointers* that we find.
1327  ValueVector Reads;
1328  ValueVector ReadWrites;
1329
1330  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
1331  // multiple times on the same object. If the ptr is accessed twice, once
1332  // for read and once for write, it will only appear once (on the write
1333  // list). This is okay, since we are going to check for conflicts between
1334  // writes and between reads and writes, but not between reads and reads.
1335  ValueSet Seen;
1336
1337  ValueVector::iterator I, IE;
1338  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
1339    StoreInst *ST = dyn_cast<StoreInst>(*I);
1340    assert(ST && "Bad StoreInst");
1341    Value* Ptr = ST->getPointerOperand();
1342    // If we did *not* see this pointer before, insert it to
1343    // the read-write list. At this phase it is only a 'write' list.
1344    if (Seen.insert(Ptr))
1345      ReadWrites.push_back(Ptr);
1346  }
1347
1348  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
1349    LoadInst *LD = dyn_cast<LoadInst>(*I);
1350    assert(LD && "Bad LoadInst");
1351    Value* Ptr = LD->getPointerOperand();
1352    // If we did *not* see this pointer before, insert it to the
1353    // read list. If we *did* see it before, then it is already in
1354    // the read-write list. This allows us to vectorize expressions
1355    // such as A[i] += x, because the address of A[i] is a read-write
1356    // pointer. This only works if the index of A[i] is consecutive.
1357    // If the address of i is unknown (for example A[B[i]]) then we may
1358    // read a few words, modify, and write a few words, and some of the
1359    // words may be written to the same address.
1360    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
1361      Reads.push_back(Ptr);
1362  }
1363
1364  // If we write (or read-write) to a single destination and there are no
1365  // other reads in this loop, then it is safe to vectorize.
1366  if (ReadWrites.size() == 1 && Reads.size() == 0) {
1367    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
1368    return true;
1369  }
1370
1371  // Now that the pointers are in two lists (Reads and ReadWrites), we
1372  // can check that there are no conflicts between the writes themselves and
1373  // between the writes and the reads.
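  // For example (illustrative): if a stored-to pointer traces back, through
  // GetUnderlyingObjects, to a plain pointer argument of the enclosing
  // function, it is not an identified object (unlike an alloca, a global, or
  // a noalias/byval argument) and the loop is rejected below; likewise two
  // writes whose underlying object is the same count as a possible
  // write-write reorder.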
1374  ValueSet WriteObjects;
1375  ValueVector TempObjects;
1376
1377  // Check that the read-writes do not conflict with other read-write
1378  // pointers.
1379  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
1380    GetUnderlyingObjects(*I, TempObjects, DL);
1381    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
1382         it != e; ++it) {
1383      if (!isIdentifiedObject(*it)) {
1384        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
1385        return false;
1386      }
1387      if (!WriteObjects.insert(*it)) {
1388        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
1389              << **it <<"\n");
1390        return false;
1391      }
1392    }
1393    TempObjects.clear();
1394  }
1395
1396  // Check that the reads don't conflict with the read-writes.
1397  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
1398    GetUnderlyingObjects(*I, TempObjects, DL);
1399    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
1400         it != e; ++it) {
1401      if (!isIdentifiedObject(*it)) {
1402        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
1403        return false;
1404      }
1405      if (WriteObjects.count(*it)) {
1406        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
1407              << **it <<"\n");
1408        return false;
1409      }
1410    }
1411    TempObjects.clear();
1412  }
1413
1414  // All is okay.
1415  return true;
1416}
1417
1418bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
1419                                                ReductionKind Kind) {
1420  if (Phi->getNumIncomingValues() != 2)
1421    return false;
1422
1423  // Find the possible incoming reduction variable.
1424  BasicBlock *BB = Phi->getParent();
1425  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
1426  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
1427  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
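  // Note (added for clarity): in the single-block loops handled by this pass
  // the PHI's parent block is also the latch, so getBasicBlockIndex(BB) picks
  // the back-edge value and the other index selects the value flowing in from
  // the preheader, i.e. the start value of the reduction.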
1428
1429  // ExitInstruction is the single value which is used outside the loop.
1430  // We only allow for a single reduction value to be used outside the loop.
1431  // This includes users of the reduction variables (which form a cycle
1432  // that ends in the phi node).
1433  Instruction *ExitInstruction = 0;
1434
1435  // Iter is our iterator. We start with the PHI node and scan for all of the
1436  // users of this instruction. All users must be instructions which can be
1437  // used as reduction variables (such as ADD). We may have a single
1438  // out-of-block user. The cycle must end with the original PHI.
1439  // Also, we can't have multiple block-local users.
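  // For example (illustrative source, not taken from this file):
  //   int sum = 0;
  //   for (int i = 0; i < n; ++i)
  //     sum += A[i];      // 'sum' is an IntegerAdd reduction.
  // Here the header PHI for 'sum', the add that feeds it, and the single use
  // of the final sum after the loop form exactly the cycle walked below.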
1440  Instruction *Iter = Phi;
1441  while (true) {
1442    // Any reduction instr must be of one of the allowed kinds.
1443    if (!isReductionInstr(Iter, Kind))
1444      return false;
1445
1446    // Did we find a user inside this block?
1447    bool FoundInBlockUser = false;
1448    // Did we reach the initial PHI node?
1449    bool FoundStartPHI = false;
1450
1451    // If the instruction has no users then this is a broken
1452    // chain and can't be a reduction variable.
1453    if (Iter->use_empty())
1454      return false;
1455
1456    // For each of the *users* of Iter.
1457    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
1458         it != e; ++it) {
1459      Instruction *U = cast<Instruction>(*it);
1460      // We already know that the PHI is a user.
1461      if (U == Phi) {
1462        FoundStartPHI = true;
1463        continue;
1464      }
1465      // Check if we found the exit user.
1466      BasicBlock *Parent = U->getParent();
1467      if (Parent != BB) {
1468        // We must have a single exit instruction.
1469        if (ExitInstruction != 0)
1470          return false;
1471        ExitInstruction = Iter;
1472      }
1473      // We can't have multiple inside users.
1474      if (FoundInBlockUser)
1475        return false;
1476      FoundInBlockUser = true;
1477      Iter = U;
1478    }
1479
1480    // We found a reduction var if we have reached the original
1481    // phi node and we only have a single instruction with out-of-loop
1482    // users.
1483    if (FoundStartPHI && ExitInstruction) {
1484      // This instruction is allowed to have out-of-loop users.
1485      AllowedExit.insert(ExitInstruction);
1486
1487      // Save the description of this reduction variable.
1488      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
1489      Reductions[Phi] = RD;
1490      return true;
1491    }
1492  }
1493}
1494
1495bool
1496LoopVectorizationLegality::isReductionInstr(Instruction *I,
1497                                            ReductionKind Kind) {
1498  switch (I->getOpcode()) {
1499  default:
1500    return false;
1501  case Instruction::PHI:
1502    // Possibly a reduction; validated by the cycle walk in AddReductionVar.
1503    return true;
1504  case Instruction::Add:
1505  case Instruction::Sub:
1506    return Kind == IntegerAdd;
1507  case Instruction::Mul:
1508  case Instruction::UDiv:
1509  case Instruction::SDiv:
1510    return Kind == IntegerMult;
1511  case Instruction::And:
1512    return Kind == IntegerAnd;
1513  case Instruction::Or:
1514    return Kind == IntegerOr;
1515  case Instruction::Xor:
1516    return Kind == IntegerXor;
1517  }
1518}
1519
1520bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
1521  // Check that the PHI is consecutive (an add-recurrence with a step of one).
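  // For example (illustrative): the counter of  for (i = 0; i < n; ++i)
  // has the SCEV add-recurrence {0,+,1}<loop>, whose step is the constant 1,
  // so it passes the checks below.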
1522  const SCEV *PhiScev = SE->getSCEV(Phi);
1523  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
1524  if (!AR) {
1525    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
1526    return false;
1527  }
1528  const SCEV *Step = AR->getStepRecurrence(*SE);
1529
1530  if (!Step->isOne()) {
1531    DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
1532    return false;
1533  }
1534  return true;
1535}
1536
1537unsigned
1538LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
1539  if (!VTTI) {
1540    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing.\n");
1541    return 1;
1542  }
1543
1544  float Cost = expectedCost(1);
1545  unsigned Width = 1;
1546  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
1547  for (unsigned i=2; i <= VF; i*=2) {
1548    // Notice that the vector loop needs to be executed fewer times, so
1549    // we need to divide the cost of the vector loop by the vectorization
1550    // factor (the number of elements processed per vector iteration).
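    // For example (illustrative numbers only): if the scalar loop costs 8 per
    // iteration and the VF=4 loop costs 20 per vector iteration, the
    // normalized vector cost is 20/4 = 5 < 8, so VF=4 is preferred.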
1551    float VectorCost = expectedCost(i) / (float)i;
1552    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
1553          (int)VectorCost << ".\n");
1554    if (VectorCost < Cost) {
1555      Cost = VectorCost;
1556      Width = i;
1557    }
1558  }
1559
1560  DEBUG(dbgs() << "LV: Selecting VF = "<< Width << ".\n");
1561  return Width;
1562}
1563
1564unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
1565  // We can only estimate the cost of single basic block loops.
1566  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
1567
1568  BasicBlock *BB = TheLoop->getHeader();
1569  unsigned Cost = 0;
1570
1571  // For each instruction in the old loop.
1572  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
1573    Instruction *Inst = it;
1574    unsigned C = getInstructionCost(Inst, VF);
1575    Cost += C;
1576    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
1577          " For instruction: "<< *Inst << "\n");
1578  }
1579
1580  return Cost;
1581}
1582
1583unsigned
1584LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
1585  assert(VTTI && "Invalid vector target transformation info");
1586
1587  // If we know that this instruction will remain uniform, check the cost of
1588  // the scalar version.
1589  if (Legal->isUniformAfterVectorization(I))
1590    VF = 1;
1591
1592  Type *RetTy = I->getType();
1593  Type *VectorTy = ToVectorTy(RetTy, VF);
1594
1595
1596  // TODO: We need to estimate the cost of intrinsic calls.
1597  switch (I->getOpcode()) {
1598    case Instruction::GetElementPtr:
1599      // We mark this instruction as zero-cost because scalar GEPs are usually
1600      // folded into the addressing mode of the memory instruction. At the
1601      // moment we don't generate vector GEPs.
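      // For example (illustrative): on x86 the address of A[i] can usually be
      // expressed directly as a [base + index*scale] operand of the load or
      // store, so the GEP itself emits no code.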
1602      return 0;
1603    case Instruction::Br: {
1604      return VTTI->getCFInstrCost(I->getOpcode());
1605    }
1606    case Instruction::PHI:
1607      return 0;
1608    case Instruction::Add:
1609    case Instruction::FAdd:
1610    case Instruction::Sub:
1611    case Instruction::FSub:
1612    case Instruction::Mul:
1613    case Instruction::FMul:
1614    case Instruction::UDiv:
1615    case Instruction::SDiv:
1616    case Instruction::FDiv:
1617    case Instruction::URem:
1618    case Instruction::SRem:
1619    case Instruction::FRem:
1620    case Instruction::Shl:
1621    case Instruction::LShr:
1622    case Instruction::AShr:
1623    case Instruction::And:
1624    case Instruction::Or:
1625    case Instruction::Xor: {
1626      return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
1627    }
1628    case Instruction::Select: {
1629      SelectInst *SI = cast<SelectInst>(I);
1630      const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
1631      bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
1632      Type *CondTy = SI->getCondition()->getType();
1633      if (!ScalarCond) // A non-invariant condition is itself vectorized.
1634        CondTy = VectorType::get(CondTy, VF);
1635
1636      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
1637    }
1638    case Instruction::ICmp:
1639    case Instruction::FCmp: {
1640      Type *ValTy = I->getOperand(0)->getType();
1641      VectorTy = ToVectorTy(ValTy, VF);
1642      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
1643    }
1644    case Instruction::Store: {
1645      StoreInst *SI = cast<StoreInst>(I);
1646      Type *ValTy = SI->getValueOperand()->getType();
1647      VectorTy = ToVectorTy(ValTy, VF);
1648
1649      if (VF == 1)
1650        return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
1651                              SI->getAlignment(), SI->getPointerAddressSpace());
1652
1653      // Scalarized stores.
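      // Descriptive summary of the cost computed below:
      //   Cost = VF * ExtCost          (extract each element to be stored)
      //        + VF * ScalarStoreCost  (one scalar store per element)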
1654      if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
1655        unsigned Cost = 0;
1656        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
1657                                              ValTy);
1658        // The cost of extracting from the value vector.
1659        Cost += VF * (ExtCost);
1660        // The cost of the scalar stores.
1661        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
1662                                           ValTy->getScalarType(),
1663                                           SI->getAlignment(),
1664                                           SI->getPointerAddressSpace());
1665        return Cost;
1666      }
1667
1668      // Wide stores.
1669      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
1670                                   SI->getPointerAddressSpace());
1671    }
1672    case Instruction::Load: {
1673      LoadInst *LI = cast<LoadInst>(I);
1674
1675      if (VF == 1)
1676        return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
1677                                     LI->getAlignment(),
1678                                     LI->getPointerAddressSpace());
1679
1680      // Scalarized loads.
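      // Analogous to the scalarized-store case above (summary of the code
      // below):
      //   Cost = VF * InsCost          (insert each loaded element)
      //        + VF * ScalarLoadCost   (one scalar load per element)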
1681      if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
1682        unsigned Cost = 0;
1683        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
1684        // The cost of inserting the loaded value into the result vector.
1685        Cost += VF * (InCost);
1686        // The cost of the scalar loads.
1687        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
1688                                           RetTy->getScalarType(),
1689                                           LI->getAlignment(),
1690                                           LI->getPointerAddressSpace());
1691        return Cost;
1692      }
1693
1694      // Wide loads.
1695      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
1696                                   LI->getPointerAddressSpace());
1697    }
1698    case Instruction::ZExt:
1699    case Instruction::SExt:
1700    case Instruction::FPToUI:
1701    case Instruction::FPToSI:
1702    case Instruction::FPExt:
1703    case Instruction::PtrToInt:
1704    case Instruction::IntToPtr:
1705    case Instruction::SIToFP:
1706    case Instruction::UIToFP:
1707    case Instruction::Trunc:
1708    case Instruction::FPTrunc:
1709    case Instruction::BitCast: {
1710      Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
1711      return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
1712    }
1713    default: {
1714      // We are scalarizing the instruction. Return the cost of the scalar
1715      // instruction, plus the cost of inserting the results into, and
1716      // extracting the operands from, vector elements, times the vector width.
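      // For example (illustrative numbers only): for VF = 4 and a two-operand
      // instruction, the overhead is 4 * (1 insert + 2 extracts) on top of
      // 4 copies of the scalar instruction itself.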
1717      unsigned Cost = 0;
1718
1719      bool IsVoid = RetTy->isVoidTy();
1720
1721      unsigned InsCost = (IsVoid ? 0 :
1722                          VTTI->getInstrCost(Instruction::InsertElement,
1723                                             VectorTy));
1724
1725      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
1726                                            VectorTy);
1727
1728      // The cost of inserting the results plus extracting each one of the
1729      // operands.
1730      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
1731
1732      // The cost of executing VF copies of the scalar instruction.
1733      Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
1734      return Cost;
1735    }
1736  }// end of switch.
1737}
1738
1739Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
1740  if (Scalar->isVoidTy() || VF == 1)
1741    return Scalar;
1742  return VectorType::get(Scalar, VF);
1743}
1744
1745} // namespace
1746
1747char LoopVectorize::ID = 0;
1748static const char lv_name[] = "Loop Vectorization";
1749INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
1750INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
1751INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
1752INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
1753INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
1754
1755namespace llvm {
1756  Pass *createLoopVectorizePass() {
1757    return new LoopVectorize();
1758  }
1759}
1760
1761