LoopVectorize.cpp revision 655d2c5354fcd44c329d99428c7d9196bc78dbad
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9#include "LoopVectorize.h"
10#include "llvm/ADT/StringExtras.h"
11#include "llvm/Analysis/AliasAnalysis.h"
12#include "llvm/Analysis/AliasSetTracker.h"
13#include "llvm/Analysis/Dominators.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/Analysis/LoopIterator.h"
16#include "llvm/Analysis/LoopPass.h"
17#include "llvm/Analysis/ScalarEvolutionExpander.h"
18#include "llvm/Analysis/ScalarEvolutionExpander.h"
19#include "llvm/Analysis/ScalarEvolutionExpressions.h"
20#include "llvm/Analysis/ValueTracking.h"
21#include "llvm/Analysis/Verifier.h"
22#include "llvm/Constants.h"
23#include "llvm/DataLayout.h"
24#include "llvm/DerivedTypes.h"
25#include "llvm/Function.h"
26#include "llvm/Instructions.h"
27#include "llvm/IntrinsicInst.h"
28#include "llvm/LLVMContext.h"
29#include "llvm/Module.h"
30#include "llvm/Pass.h"
31#include "llvm/Support/CommandLine.h"
32#include "llvm/Support/Debug.h"
33#include "llvm/Support/raw_ostream.h"
34#include "llvm/TargetTransformInfo.h"
35#include "llvm/Transforms/Scalar.h"
36#include "llvm/Transforms/Utils/BasicBlockUtils.h"
37#include "llvm/Transforms/Utils/Local.h"
38#include "llvm/Transforms/Vectorize.h"
39#include "llvm/Type.h"
40#include "llvm/Value.h"
41
42static cl::opt<unsigned>
43VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
44                    cl::desc("Sets the SIMD width. Zero is autoselect."));
45
46static cl::opt<bool>
47EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
48                   cl::desc("Enable if-conversion during vectorization."));
49
50namespace {
51
52/// The LoopVectorize Pass.
53struct LoopVectorize : public LoopPass {
54  static char ID; // Pass identification, replacement for typeid
55
56  LoopVectorize() : LoopPass(ID) {
57    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
58  }
59
60  ScalarEvolution *SE;
61  DataLayout *DL;
62  LoopInfo *LI;
63  TargetTransformInfo *TTI;
64  DominatorTree *DT;
65
66  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
67    // We only vectorize innermost loops.
68    if (!L->empty())
69      return false;
70
71    SE = &getAnalysis<ScalarEvolution>();
72    DL = getAnalysisIfAvailable<DataLayout>();
73    LI = &getAnalysis<LoopInfo>();
74    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
75    DT = &getAnalysis<DominatorTree>();
76
77    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
78          L->getHeader()->getParent()->getName() << "\"\n");
79
80    // Check if it is legal to vectorize the loop.
81    LoopVectorizationLegality LVL(L, SE, DL, DT);
82    if (!LVL.canVectorize()) {
83      DEBUG(dbgs() << "LV: Not vectorizing.\n");
84      return false;
85    }
86
87    // Select the preffered vectorization factor.
88    unsigned VF = 1;
89    if (VectorizationFactor == 0) {
90      const VectorTargetTransformInfo *VTTI = 0;
91      if (TTI)
92        VTTI = TTI->getVectorTargetTransformInfo();
93      // Use the cost model.
94      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
95      VF = CM.findBestVectorizationFactor();
96
97      if (VF == 1) {
98        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
99        return false;
100      }
101
102    } else {
103      // Use the user command flag.
104      VF = VectorizationFactor;
105    }
106
107    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
108          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
109          "\n");
110
111    // If we decided that it is *legal* to vectorizer the loop then do it.
112    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF);
113    LB.vectorize(&LVL);
114
115    DEBUG(verifyFunction(*L->getHeader()->getParent()));
116    return true;
117  }
118
119  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
120    LoopPass::getAnalysisUsage(AU);
121    AU.addRequiredID(LoopSimplifyID);
122    AU.addRequiredID(LCSSAID);
123    AU.addRequired<LoopInfo>();
124    AU.addRequired<ScalarEvolution>();
125    AU.addRequired<DominatorTree>();
126    AU.addPreserved<LoopInfo>();
127    AU.addPreserved<DominatorTree>();
128  }
129
130};
131
132}// namespace
133
134//===----------------------------------------------------------------------===//
135// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
136// LoopVectorizationCostModel.
137//===----------------------------------------------------------------------===//
138
139void
140LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
141                                                       Loop *Lp, Value *Ptr) {
142  const SCEV *Sc = SE->getSCEV(Ptr);
143  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
144  assert(AR && "Invalid addrec expression");
145  const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
146  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
147  Pointers.push_back(Ptr);
148  Starts.push_back(AR->getStart());
149  Ends.push_back(ScEnd);
150}
151
152Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
153  // Create the types.
154  LLVMContext &C = V->getContext();
155  Type *VTy = VectorType::get(V->getType(), VF);
156  Type *I32 = IntegerType::getInt32Ty(C);
157
158  // Save the current insertion location.
159  Instruction *Loc = Builder.GetInsertPoint();
160
161  // We need to place the broadcast of invariant variables outside the loop.
162  Instruction *Instr = dyn_cast<Instruction>(V);
163  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
164  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
165
166  // Place the code for broadcasting invariant variables in the new preheader.
167  if (Invariant)
168    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
169
170  Constant *Zero = ConstantInt::get(I32, 0);
171  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
172  Value *UndefVal = UndefValue::get(VTy);
173  // Insert the value into a new vector.
174  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
175  // Broadcast the scalar into all locations in the vector.
176  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
177                                            "broadcast");
178
179  // Restore the builder insertion point.
180  if (Invariant)
181    Builder.SetInsertPoint(Loc);
182
183  return Shuf;
184}
185
186Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
187  assert(Val->getType()->isVectorTy() && "Must be a vector");
188  assert(Val->getType()->getScalarType()->isIntegerTy() &&
189         "Elem must be an integer");
190  // Create the types.
191  Type *ITy = Val->getType()->getScalarType();
192  VectorType *Ty = cast<VectorType>(Val->getType());
193  int VLen = Ty->getNumElements();
194  SmallVector<Constant*, 8> Indices;
195
196  // Create a vector of consecutive numbers from zero to VF.
197  for (int i = 0; i < VLen; ++i)
198    Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i ));
199
200  // Add the consecutive indices to the vector value.
201  Constant *Cv = ConstantVector::get(Indices);
202  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
203  return Builder.CreateAdd(Val, Cv, "induction");
204}
205
206bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
207  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
208
209  // If this value is a pointer induction variable we know it is consecutive.
210  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
211  if (Phi && Inductions.count(Phi)) {
212    InductionInfo II = Inductions[Phi];
213    if (PtrInduction == II.IK)
214      return true;
215  }
216
217  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
218  if (!Gep)
219    return false;
220
221  unsigned NumOperands = Gep->getNumOperands();
222  Value *LastIndex = Gep->getOperand(NumOperands - 1);
223
224  // Check that all of the gep indices are uniform except for the last.
225  for (unsigned i = 0; i < NumOperands - 1; ++i)
226    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
227      return false;
228
229  // We can emit wide load/stores only if the last index is the induction
230  // variable.
231  const SCEV *Last = SE->getSCEV(LastIndex);
232  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
233    const SCEV *Step = AR->getStepRecurrence(*SE);
234
235    // The memory is consecutive because the last index is consecutive
236    // and all other indices are loop invariant.
237    if (Step->isOne())
238      return true;
239  }
240
241  return false;
242}
243
244bool LoopVectorizationLegality::isUniform(Value *V) {
245  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
246}
247
248Value *InnerLoopVectorizer::getVectorValue(Value *V) {
249  assert(V != Induction && "The new induction variable should not be used.");
250  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
251  // If we saved a vectorized copy of V, use it.
252  Value *&MapEntry = WidenMap[V];
253  if (MapEntry)
254    return MapEntry;
255
256  // Broadcast V and save the value for future uses.
257  Value *B = getBroadcastInstrs(V);
258  MapEntry = B;
259  return B;
260}
261
262Constant*
263InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
264  return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
265}
266
267void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
268  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
269  // Holds vector parameters or scalars, in case of uniform vals.
270  SmallVector<Value*, 8> Params;
271
272  // Find all of the vectorized parameters.
273  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
274    Value *SrcOp = Instr->getOperand(op);
275
276    // If we are accessing the old induction variable, use the new one.
277    if (SrcOp == OldInduction) {
278      Params.push_back(getVectorValue(SrcOp));
279      continue;
280    }
281
282    // Try using previously calculated values.
283    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
284
285    // If the src is an instruction that appeared earlier in the basic block
286    // then it should already be vectorized.
287    if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
288      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
289      // The parameter is a vector value from earlier.
290      Params.push_back(WidenMap[SrcInst]);
291    } else {
292      // The parameter is a scalar from outside the loop. Maybe even a constant.
293      Params.push_back(SrcOp);
294    }
295  }
296
297  assert(Params.size() == Instr->getNumOperands() &&
298         "Invalid number of operands");
299
300  // Does this instruction return a value ?
301  bool IsVoidRetTy = Instr->getType()->isVoidTy();
302  Value *VecResults = 0;
303
304  // If we have a return value, create an empty vector. We place the scalarized
305  // instructions in this vector.
306  if (!IsVoidRetTy)
307    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
308
309  // For each scalar that we create:
310  for (unsigned i = 0; i < VF; ++i) {
311    Instruction *Cloned = Instr->clone();
312    if (!IsVoidRetTy)
313      Cloned->setName(Instr->getName() + ".cloned");
314    // Replace the operands of the cloned instrucions with extracted scalars.
315    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
316      Value *Op = Params[op];
317      // Param is a vector. Need to extract the right lane.
318      if (Op->getType()->isVectorTy())
319        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
320      Cloned->setOperand(op, Op);
321    }
322
323    // Place the cloned scalar in the new loop.
324    Builder.Insert(Cloned);
325
326    // If the original scalar returns a value we need to place it in a vector
327    // so that future users will be able to use it.
328    if (!IsVoidRetTy)
329      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
330                                               Builder.getInt32(i));
331  }
332
333  if (!IsVoidRetTy)
334    WidenMap[Instr] = VecResults;
335}
336
337Value*
338InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
339                                     Instruction *Loc) {
340  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
341  Legal->getRuntimePointerCheck();
342
343  if (!PtrRtCheck->Need)
344    return NULL;
345
346  Value *MemoryRuntimeCheck = 0;
347  unsigned NumPointers = PtrRtCheck->Pointers.size();
348  SmallVector<Value* , 2> Starts;
349  SmallVector<Value* , 2> Ends;
350
351  SCEVExpander Exp(*SE, "induction");
352
353  // Use this type for pointer arithmetic.
354  Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
355
356  for (unsigned i = 0; i < NumPointers; ++i) {
357    Value *Ptr = PtrRtCheck->Pointers[i];
358    const SCEV *Sc = SE->getSCEV(Ptr);
359
360    if (SE->isLoopInvariant(Sc, OrigLoop)) {
361      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
362            *Ptr <<"\n");
363      Starts.push_back(Ptr);
364      Ends.push_back(Ptr);
365    } else {
366      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
367
368      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
369      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
370      Starts.push_back(Start);
371      Ends.push_back(End);
372    }
373  }
374
375  for (unsigned i = 0; i < NumPointers; ++i) {
376    for (unsigned j = i+1; j < NumPointers; ++j) {
377      Instruction::CastOps Op = Instruction::BitCast;
378      Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc);
379      Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc);
380      Value *End0 =   CastInst::Create(Op, Ends[i],   PtrArithTy, "bc", Loc);
381      Value *End1 =   CastInst::Create(Op, Ends[j],   PtrArithTy, "bc", Loc);
382
383      Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
384                                    Start0, End1, "bound0", Loc);
385      Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
386                                    Start1, End0, "bound1", Loc);
387      Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
388                                                 "found.conflict", Loc);
389      if (MemoryRuntimeCheck)
390        MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
391                                                    MemoryRuntimeCheck,
392                                                    IsConflict,
393                                                    "conflict.rdx", Loc);
394      else
395        MemoryRuntimeCheck = IsConflict;
396
397    }
398  }
399
400  return MemoryRuntimeCheck;
401}
402
403void
404InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
405  /*
406   In this function we generate a new loop. The new loop will contain
407   the vectorized instructions while the old loop will continue to run the
408   scalar remainder.
409
410   [ ] <-- vector loop bypass.
411   /  |
412   /   v
413   |   [ ]     <-- vector pre header.
414   |    |
415   |    v
416   |   [  ] \
417   |   [  ]_|   <-- vector loop.
418   |    |
419   \   v
420   >[ ]   <--- middle-block.
421   /  |
422   /   v
423   |   [ ]     <--- new preheader.
424   |    |
425   |    v
426   |   [ ] \
427   |   [ ]_|   <-- old scalar loop to handle remainder.
428   \   |
429   \  v
430   >[ ]     <-- exit block.
431   ...
432   */
433
434  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
435  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
436  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
437  assert(ExitBlock && "Must have an exit block");
438
439  // Some loops have a single integer induction variable, while other loops
440  // don't. One example is c++ iterators that often have multiple pointer
441  // induction variables. In the code below we also support a case where we
442  // don't have a single induction variable.
443  OldInduction = Legal->getInduction();
444  Type *IdxTy = OldInduction ? OldInduction->getType() :
445  DL->getIntPtrType(SE->getContext());
446
447  // Find the loop boundaries.
448  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
449  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
450
451  // Get the total trip count from the count by adding 1.
452  ExitCount = SE->getAddExpr(ExitCount,
453                             SE->getConstant(ExitCount->getType(), 1));
454
455  // Expand the trip count and place the new instructions in the preheader.
456  // Notice that the pre-header does not change, only the loop body.
457  SCEVExpander Exp(*SE, "induction");
458
459  // Count holds the overall loop count (N).
460  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
461                                   BypassBlock->getTerminator());
462
463  // The loop index does not have to start at Zero. Find the original start
464  // value from the induction PHI node. If we don't have an induction variable
465  // then we know that it starts at zero.
466  Value *StartIdx = OldInduction ?
467  OldInduction->getIncomingValueForBlock(BypassBlock):
468  ConstantInt::get(IdxTy, 0);
469
470  assert(BypassBlock && "Invalid loop structure");
471
472  // Generate the code that checks in runtime if arrays overlap.
473  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
474                                              BypassBlock->getTerminator());
475
476  // Split the single block loop into the two loop structure described above.
477  BasicBlock *VectorPH =
478  BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
479  BasicBlock *VecBody =
480  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
481  BasicBlock *MiddleBlock =
482  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
483  BasicBlock *ScalarPH =
484  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
485
486  // This is the location in which we add all of the logic for bypassing
487  // the new vector loop.
488  Instruction *Loc = BypassBlock->getTerminator();
489
490  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
491  // inside the loop.
492  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
493
494  // Generate the induction variable.
495  Induction = Builder.CreatePHI(IdxTy, 2, "index");
496  Constant *Step = ConstantInt::get(IdxTy, VF);
497
498  // We may need to extend the index in case there is a type mismatch.
499  // We know that the count starts at zero and does not overflow.
500  if (Count->getType() != IdxTy) {
501    // The exit count can be of pointer type. Convert it to the correct
502    // integer type.
503    if (ExitCount->getType()->isPointerTy())
504      Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
505    else
506      Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
507  }
508
509  // Add the start index to the loop count to get the new end index.
510  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
511
512  // Now we need to generate the expression for N - (N % VF), which is
513  // the part that the vectorized body will execute.
514  Constant *CIVF = ConstantInt::get(IdxTy, VF);
515  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
516  Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
517  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
518                                                     "end.idx.rnd.down", Loc);
519
520  // Now, compare the new count to zero. If it is zero skip the vector loop and
521  // jump to the scalar loop.
522  Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
523                               IdxEndRoundDown,
524                               StartIdx,
525                               "cmp.zero", Loc);
526
527  // If we are using memory runtime checks, include them in.
528  if (MemoryRuntimeCheck)
529    Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
530                                 "CntOrMem", Loc);
531
532  BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
533  // Remove the old terminator.
534  Loc->eraseFromParent();
535
536  // We are going to resume the execution of the scalar loop.
537  // Go over all of the induction variables that we found and fix the
538  // PHIs that are left in the scalar version of the loop.
539  // The starting values of PHI nodes depend on the counter of the last
540  // iteration in the vectorized loop.
541  // If we come from a bypass edge then we need to start from the original
542  // start value.
543
544  // This variable saves the new starting index for the scalar loop.
545  PHINode *ResumeIndex = 0;
546  LoopVectorizationLegality::InductionList::iterator I, E;
547  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
548  for (I = List->begin(), E = List->end(); I != E; ++I) {
549    PHINode *OrigPhi = I->first;
550    LoopVectorizationLegality::InductionInfo II = I->second;
551    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
552                                         MiddleBlock->getTerminator());
553    Value *EndValue = 0;
554    switch (II.IK) {
555    case LoopVectorizationLegality::NoInduction:
556      llvm_unreachable("Unknown induction");
557    case LoopVectorizationLegality::IntInduction: {
558      // Handle the integer induction counter:
559      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
560      assert(OrigPhi == OldInduction && "Unknown integer PHI");
561      // We know what the end value is.
562      EndValue = IdxEndRoundDown;
563      // We also know which PHI node holds it.
564      ResumeIndex = ResumeVal;
565      break;
566    }
567    case LoopVectorizationLegality::ReverseIntInduction: {
568      // Convert the CountRoundDown variable to the PHI size.
569      unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
570      unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
571      Value *CRD = CountRoundDown;
572      if (CRDSize > IISize)
573        CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
574                               II.StartValue->getType(),
575                               "tr.crd", BypassBlock->getTerminator());
576      else if (CRDSize < IISize)
577        CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
578                               II.StartValue->getType(),
579                               "sext.crd", BypassBlock->getTerminator());
580      // Handle reverse integer induction counter:
581      EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
582                                           BypassBlock->getTerminator());
583      break;
584    }
585    case LoopVectorizationLegality::PtrInduction: {
586      // For pointer induction variables, calculate the offset using
587      // the end index.
588      EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown,
589                                           "ptr.ind.end",
590                                           BypassBlock->getTerminator());
591      break;
592    }
593    }// end of case
594
595    // The new PHI merges the original incoming value, in case of a bypass,
596    // or the value at the end of the vectorized loop.
597    ResumeVal->addIncoming(II.StartValue, BypassBlock);
598    ResumeVal->addIncoming(EndValue, VecBody);
599
600    // Fix the scalar body counter (PHI node).
601    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
602    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
603  }
604
605  // If we are generating a new induction variable then we also need to
606  // generate the code that calculates the exit value. This value is not
607  // simply the end of the counter because we may skip the vectorized body
608  // in case of a runtime check.
609  if (!OldInduction){
610    assert(!ResumeIndex && "Unexpected resume value found");
611    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
612                                  MiddleBlock->getTerminator());
613    ResumeIndex->addIncoming(StartIdx, BypassBlock);
614    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
615  }
616
617  // Make sure that we found the index where scalar loop needs to continue.
618  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
619         "Invalid resume Index");
620
621  // Add a check in the middle block to see if we have completed
622  // all of the iterations in the first vector loop.
623  // If (N - N%VF) == N, then we *don't* need to run the remainder.
624  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
625                                ResumeIndex, "cmp.n",
626                                MiddleBlock->getTerminator());
627
628  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
629  // Remove the old terminator.
630  MiddleBlock->getTerminator()->eraseFromParent();
631
632  // Create i+1 and fill the PHINode.
633  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
634  Induction->addIncoming(StartIdx, VectorPH);
635  Induction->addIncoming(NextIdx, VecBody);
636  // Create the compare.
637  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
638  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
639
640  // Now we have two terminators. Remove the old one from the block.
641  VecBody->getTerminator()->eraseFromParent();
642
643  // Get ready to start creating new instructions into the vectorized body.
644  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
645
646  // Create and register the new vector loop.
647  Loop* Lp = new Loop();
648  Loop *ParentLoop = OrigLoop->getParentLoop();
649
650  // Insert the new loop into the loop nest and register the new basic blocks.
651  if (ParentLoop) {
652    ParentLoop->addChildLoop(Lp);
653    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
654    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
655    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
656  } else {
657    LI->addTopLevelLoop(Lp);
658  }
659
660  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
661
662  // Save the state.
663  LoopVectorPreHeader = VectorPH;
664  LoopScalarPreHeader = ScalarPH;
665  LoopMiddleBlock = MiddleBlock;
666  LoopExitBlock = ExitBlock;
667  LoopVectorBody = VecBody;
668  LoopScalarBody = OldBasicBlock;
669  LoopBypassBlock = BypassBlock;
670}
671
672/// This function returns the identity element (or neutral element) for
673/// the operation K.
674static unsigned
675getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
676  switch (K) {
677  case LoopVectorizationLegality::IntegerXor:
678  case LoopVectorizationLegality::IntegerAdd:
679  case LoopVectorizationLegality::IntegerOr:
680    // Adding, Xoring, Oring zero to a number does not change it.
681    return 0;
682  case LoopVectorizationLegality::IntegerMult:
683    // Multiplying a number by 1 does not change it.
684    return 1;
685  case LoopVectorizationLegality::IntegerAnd:
686    // AND-ing a number with an all-1 value does not change it.
687    return -1;
688  default:
689    llvm_unreachable("Unknown reduction kind");
690  }
691}
692
693static bool
694isTriviallyVectorizableIntrinsic(Instruction *Inst) {
695  IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
696  if (!II)
697    return false;
698  switch (II->getIntrinsicID()) {
699  case Intrinsic::sqrt:
700  case Intrinsic::sin:
701  case Intrinsic::cos:
702  case Intrinsic::exp:
703  case Intrinsic::exp2:
704  case Intrinsic::log:
705  case Intrinsic::log10:
706  case Intrinsic::log2:
707  case Intrinsic::fabs:
708  case Intrinsic::floor:
709  case Intrinsic::ceil:
710  case Intrinsic::trunc:
711  case Intrinsic::rint:
712  case Intrinsic::nearbyint:
713  case Intrinsic::pow:
714  case Intrinsic::fma:
715    return true;
716  default:
717    return false;
718  }
719  return false;
720}
721
722void
723InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
724  //===------------------------------------------------===//
725  //
726  // Notice: any optimization or new instruction that go
727  // into the code below should be also be implemented in
728  // the cost-model.
729  //
730  //===------------------------------------------------===//
731  BasicBlock &BB = *OrigLoop->getHeader();
732  Constant *Zero =
733  ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0);
734
735  // In order to support reduction variables we need to be able to vectorize
736  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
737  // stages. First, we create a new vector PHI node with no incoming edges.
738  // We use this value when we vectorize all of the instructions that use the
739  // PHI. Next, after all of the instructions in the block are complete we
740  // add the new incoming edges to the PHI. At this point all of the
741  // instructions in the basic block are vectorized, so we can use them to
742  // construct the PHI.
743  PhiVector RdxPHIsToFix;
744
745  // Scan the loop in a topological order to ensure that defs are vectorized
746  // before users.
747  LoopBlocksDFS DFS(OrigLoop);
748  DFS.perform(LI);
749
750  // Vectorize all of the blocks in the original loop.
751  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
752       be = DFS.endRPO(); bb != be; ++bb)
753    vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
754
755  // At this point every instruction in the original loop is widened to
756  // a vector form. We are almost done. Now, we need to fix the PHI nodes
757  // that we vectorized. The PHI nodes are currently empty because we did
758  // not want to introduce cycles. Notice that the remaining PHI nodes
759  // that we need to fix are reduction variables.
760
761  // Create the 'reduced' values for each of the induction vars.
762  // The reduced values are the vector values that we scalarize and combine
763  // after the loop is finished.
764  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
765       it != e; ++it) {
766    PHINode *RdxPhi = *it;
767    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
768    assert(RdxPhi && "Unable to recover vectorized PHI");
769
770    // Find the reduction variable descriptor.
771    assert(Legal->getReductionVars()->count(RdxPhi) &&
772           "Unable to find the reduction variable");
773    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
774    (*Legal->getReductionVars())[RdxPhi];
775
776    // We need to generate a reduction vector from the incoming scalar.
777    // To do so, we need to generate the 'identity' vector and overide
778    // one of the elements with the incoming scalar reduction. We need
779    // to do it in the vector-loop preheader.
780    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
781
782    // This is the vector-clone of the value that leaves the loop.
783    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
784    Type *VecTy = VectorExit->getType();
785
786    // Find the reduction identity variable. Zero for addition, or, xor,
787    // one for multiplication, -1 for And.
788    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
789                                          VecTy->getScalarType());
790
791    // This vector is the Identity vector where the first element is the
792    // incoming scalar reduction.
793    Value *VectorStart = Builder.CreateInsertElement(Identity,
794                                                     RdxDesc.StartValue, Zero);
795
796    // Fix the vector-loop phi.
797    // We created the induction variable so we know that the
798    // preheader is the first entry.
799    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
800
801    // Reductions do not have to start at zero. They can start with
802    // any loop invariant values.
803    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
804    Value *Val =
805    getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
806    VecRdxPhi->addIncoming(Val, LoopVectorBody);
807
808    // Before each round, move the insertion point right between
809    // the PHIs and the values we are going to write.
810    // This allows us to write both PHINodes and the extractelement
811    // instructions.
812    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
813
814    // This PHINode contains the vectorized reduction variable, or
815    // the initial value vector, if we bypass the vector loop.
816    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
817    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
818    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
819
820    // Extract the first scalar.
821    Value *Scalar0 =
822    Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
823    // Extract and reduce the remaining vector elements.
824    for (unsigned i=1; i < VF; ++i) {
825      Value *Scalar1 =
826      Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
827      switch (RdxDesc.Kind) {
828      case LoopVectorizationLegality::IntegerAdd:
829        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx");
830        break;
831      case LoopVectorizationLegality::IntegerMult:
832        Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx");
833        break;
834      case LoopVectorizationLegality::IntegerOr:
835        Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx");
836        break;
837      case LoopVectorizationLegality::IntegerAnd:
838        Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx");
839        break;
840      case LoopVectorizationLegality::IntegerXor:
841        Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx");
842        break;
843      default:
844        llvm_unreachable("Unknown reduction operation");
845      }
846    }
847
848    // Now, we need to fix the users of the reduction variable
849    // inside and outside of the scalar remainder loop.
850    // We know that the loop is in LCSSA form. We need to update the
851    // PHI nodes in the exit blocks.
852    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
853         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
854      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
855      if (!LCSSAPhi) continue;
856
857      // All PHINodes need to have a single entry edge, or two if
858      // we already fixed them.
859      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
860
861      // We found our reduction value exit-PHI. Update it with the
862      // incoming bypass edge.
863      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
864        // Add an edge coming from the bypass.
865        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
866        break;
867      }
868    }// end of the LCSSA phi scan.
869
870    // Fix the scalar loop reduction variable with the incoming reduction sum
871    // from the vector body and from the backedge value.
872    int IncomingEdgeBlockIdx =
873    (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
874    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
875    // Pick the other block.
876    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
877    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
878    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
879  }// end of for each redux variable.
880}
881
882Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
883  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
884         "Invalid edge");
885
886  Value *SrcMask = createBlockInMask(Src);
887
888  // The terminator has to be a branch inst!
889  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
890  assert(BI && "Unexpected terminator found");
891
892  Value *EdgeMask = SrcMask;
893  if (BI->isConditional()) {
894    EdgeMask = getVectorValue(BI->getCondition());
895    if (BI->getSuccessor(0) != Dst)
896      EdgeMask = Builder.CreateNot(EdgeMask);
897  }
898
899  return Builder.CreateAnd(EdgeMask, SrcMask);
900}
901
902Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
903  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
904
905  // Loop incoming mask is all-one.
906  if (OrigLoop->getHeader() == BB) {
907    Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
908    return getVectorValue(C);
909  }
910
911  // This is the block mask. We OR all incoming edges, and with zero.
912  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
913  Value *BlockMask = getVectorValue(Zero);
914
915  // For each pred:
916  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it)
917    BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB));
918
919  return BlockMask;
920}
921
922void
923InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
924                                          BasicBlock *BB, PhiVector *PV) {
925  Constant *Zero =
926  ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0);
927
928  // For each instruction in the old loop.
929  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
930    switch (it->getOpcode()) {
931    case Instruction::Br:
932      // Nothing to do for PHIs and BR, since we already took care of the
933      // loop control flow instructions.
934      continue;
935    case Instruction::PHI:{
936      PHINode* P = cast<PHINode>(it);
937      // Handle reduction variables:
938      if (Legal->getReductionVars()->count(P)) {
939        // This is phase one of vectorizing PHIs.
940        Type *VecTy = VectorType::get(it->getType(), VF);
941        WidenMap[it] =
942          PHINode::Create(VecTy, 2, "vec.phi",
943                          LoopVectorBody->getFirstInsertionPt());
944        PV->push_back(P);
945        continue;
946      }
947
948      // Check for PHI nodes that are lowered to vector selects.
949      if (P->getParent() != OrigLoop->getHeader()) {
950        // We know that all PHIs in non header blocks are converted into
951        // selects, so we don't have to worry about the insertion order and we
952        // can just use the builder.
953
954        // At this point we generate the predication tree. There may be
955        // duplications since this is a simple recursive scan, but future
956        // optimizations will clean it up.
957        Value *Cond = createEdgeMask(P->getIncomingBlock(0), P->getParent());
958        WidenMap[P] =
959          Builder.CreateSelect(Cond,
960                               getVectorValue(P->getIncomingValue(0)),
961                               getVectorValue(P->getIncomingValue(1)),
962                               "predphi");
963        continue;
964      }
965
966      // This PHINode must be an induction variable.
967      // Make sure that we know about it.
968      assert(Legal->getInductionVars()->count(P) &&
969             "Not an induction variable");
970
971      LoopVectorizationLegality::InductionInfo II =
972        Legal->getInductionVars()->lookup(P);
973
974      switch (II.IK) {
975      case LoopVectorizationLegality::NoInduction:
976        llvm_unreachable("Unknown induction");
977      case LoopVectorizationLegality::IntInduction: {
978        assert(P == OldInduction && "Unexpected PHI");
979        Value *Broadcasted = getBroadcastInstrs(Induction);
980        // After broadcasting the induction variable we need to make the
981        // vector consecutive by adding 0, 1, 2 ...
982        Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
983        WidenMap[OldInduction] = ConsecutiveInduction;
984        continue;
985      }
986      case LoopVectorizationLegality::ReverseIntInduction:
987      case LoopVectorizationLegality::PtrInduction:
988        // Handle reverse integer and pointer inductions.
989        Value *StartIdx = 0;
990        // If we have a single integer induction variable then use it.
991        // Otherwise, start counting at zero.
992        if (OldInduction) {
993          LoopVectorizationLegality::InductionInfo OldII =
994            Legal->getInductionVars()->lookup(OldInduction);
995          StartIdx = OldII.StartValue;
996        } else {
997          StartIdx = ConstantInt::get(Induction->getType(), 0);
998        }
999        // This is the normalized GEP that starts counting at zero.
1000        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
1001                                                 "normalized.idx");
1002
1003        // Handle the reverse integer induction variable case.
1004        if (LoopVectorizationLegality::ReverseIntInduction == II.IK) {
1005          IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
1006          Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
1007                                                 "resize.norm.idx");
1008          Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
1009                                                 "reverse.idx");
1010
1011          // This is a new value so do not hoist it out.
1012          Value *Broadcasted = getBroadcastInstrs(ReverseInd);
1013          // After broadcasting the induction variable we need to make the
1014          // vector consecutive by adding  ... -3, -2, -1, 0.
1015          Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted,
1016                                                             true);
1017          WidenMap[it] = ConsecutiveInduction;
1018          continue;
1019        }
1020
1021        // Handle the pointer induction variable case.
1022        assert(P->getType()->isPointerTy() && "Unexpected type.");
1023
1024        // This is the vector of results. Notice that we don't generate
1025        // vector geps because scalar geps result in better code.
1026        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
1027        for (unsigned int i = 0; i < VF; ++i) {
1028          Constant *Idx = ConstantInt::get(Induction->getType(), i);
1029          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx,
1030                                               "gep.idx");
1031          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
1032                                             "next.gep");
1033          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
1034                                               Builder.getInt32(i),
1035                                               "insert.gep");
1036        }
1037
1038        WidenMap[it] = VecVal;
1039        continue;
1040      }
1041
1042    }// End of PHI.
1043
1044    case Instruction::Add:
1045    case Instruction::FAdd:
1046    case Instruction::Sub:
1047    case Instruction::FSub:
1048    case Instruction::Mul:
1049    case Instruction::FMul:
1050    case Instruction::UDiv:
1051    case Instruction::SDiv:
1052    case Instruction::FDiv:
1053    case Instruction::URem:
1054    case Instruction::SRem:
1055    case Instruction::FRem:
1056    case Instruction::Shl:
1057    case Instruction::LShr:
1058    case Instruction::AShr:
1059    case Instruction::And:
1060    case Instruction::Or:
1061    case Instruction::Xor: {
1062      // Just widen binops.
1063      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
1064      Value *A = getVectorValue(it->getOperand(0));
1065      Value *B = getVectorValue(it->getOperand(1));
1066
1067      // Use this vector value for all users of the original instruction.
1068      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
1069      WidenMap[it] = V;
1070
1071      // Update the NSW, NUW and Exact flags.
1072      BinaryOperator *VecOp = cast<BinaryOperator>(V);
1073      if (isa<OverflowingBinaryOperator>(BinOp)) {
1074        VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
1075        VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
1076      }
1077      if (isa<PossiblyExactOperator>(VecOp))
1078        VecOp->setIsExact(BinOp->isExact());
1079      break;
1080    }
1081    case Instruction::Select: {
1082      // Widen selects.
1083      // If the selector is loop invariant we can create a select
1084      // instruction with a scalar condition. Otherwise, use vector-select.
1085      Value *Cond = it->getOperand(0);
1086      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
1087
1088      // The condition can be loop invariant  but still defined inside the
1089      // loop. This means that we can't just use the original 'cond' value.
1090      // We have to take the 'vectorized' value and pick the first lane.
1091      // Instcombine will make this a no-op.
1092      Cond = getVectorValue(Cond);
1093      if (InvariantCond)
1094        Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
1095
1096      Value *Op0 = getVectorValue(it->getOperand(1));
1097      Value *Op1 = getVectorValue(it->getOperand(2));
1098      WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1);
1099      break;
1100    }
1101
1102    case Instruction::ICmp:
1103    case Instruction::FCmp: {
1104      // Widen compares. Generate vector compares.
1105      bool FCmp = (it->getOpcode() == Instruction::FCmp);
1106      CmpInst *Cmp = dyn_cast<CmpInst>(it);
1107      Value *A = getVectorValue(it->getOperand(0));
1108      Value *B = getVectorValue(it->getOperand(1));
1109      if (FCmp)
1110        WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
1111      else
1112        WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
1113      break;
1114    }
1115
1116    case Instruction::Store: {
1117      // Attempt to issue a wide store.
1118      StoreInst *SI = dyn_cast<StoreInst>(it);
1119      Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
1120      Value *Ptr = SI->getPointerOperand();
1121      unsigned Alignment = SI->getAlignment();
1122
1123      assert(!Legal->isUniform(Ptr) &&
1124             "We do not allow storing to uniform addresses");
1125
1126      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
1127
1128      // This store does not use GEPs.
1129      if (!Legal->isConsecutivePtr(Ptr)) {
1130        scalarizeInstruction(it);
1131        break;
1132      }
1133
1134      if (Gep) {
1135        // The last index does not have to be the induction. It can be
1136        // consecutive and be a function of the index. For example A[I+1];
1137        unsigned NumOperands = Gep->getNumOperands();
1138        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
1139        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
1140
1141        // Create the new GEP with the new induction variable.
1142        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
1143        Gep2->setOperand(NumOperands - 1, LastIndex);
1144        Ptr = Builder.Insert(Gep2);
1145      } else {
1146        // Use the induction element ptr.
1147        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
1148        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
1149      }
1150      Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
1151      Value *Val = getVectorValue(SI->getValueOperand());
1152      Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
1153      break;
1154    }
1155    case Instruction::Load: {
1156      // Attempt to issue a wide load.
1157      LoadInst *LI = dyn_cast<LoadInst>(it);
1158      Type *RetTy = VectorType::get(LI->getType(), VF);
1159      Value *Ptr = LI->getPointerOperand();
1160      unsigned Alignment = LI->getAlignment();
1161      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
1162
1163      // If the pointer is loop invariant or if it is non consecutive,
1164      // scalarize the load.
1165      bool Con = Legal->isConsecutivePtr(Ptr);
1166      if (Legal->isUniform(Ptr) || !Con) {
1167        scalarizeInstruction(it);
1168        break;
1169      }
1170
1171      if (Gep) {
1172        // The last index does not have to be the induction. It can be
1173        // consecutive and be a function of the index. For example A[I+1];
1174        unsigned NumOperands = Gep->getNumOperands();
1175        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
1176        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
1177
1178        // Create the new GEP with the new induction variable.
1179        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
1180        Gep2->setOperand(NumOperands - 1, LastIndex);
1181        Ptr = Builder.Insert(Gep2);
1182      } else {
1183        // Use the induction element ptr.
1184        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
1185        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
1186      }
1187
1188      Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
1189      LI = Builder.CreateLoad(Ptr);
1190      LI->setAlignment(Alignment);
1191      // Use this vector value for all users of the load.
1192      WidenMap[it] = LI;
1193      break;
1194    }
1195    case Instruction::ZExt:
1196    case Instruction::SExt:
1197    case Instruction::FPToUI:
1198    case Instruction::FPToSI:
1199    case Instruction::FPExt:
1200    case Instruction::PtrToInt:
1201    case Instruction::IntToPtr:
1202    case Instruction::SIToFP:
1203    case Instruction::UIToFP:
1204    case Instruction::Trunc:
1205    case Instruction::FPTrunc:
1206    case Instruction::BitCast: {
1207      CastInst *CI = dyn_cast<CastInst>(it);
1208      /// Optimize the special case where the source is the induction
1209      /// variable. Notice that we can only optimize the 'trunc' case
1210      /// because: a. FP conversions lose precision, b. sext/zext may wrap,
1211      /// c. other casts depend on pointer size.
1212      if (CI->getOperand(0) == OldInduction &&
1213          it->getOpcode() == Instruction::Trunc) {
1214        Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
1215                                               CI->getType());
1216        Value *Broadcasted = getBroadcastInstrs(ScalarCast);
1217        WidenMap[it] = getConsecutiveVector(Broadcasted);
1218        break;
1219      }
1220      /// Vectorize casts.
1221      Value *A = getVectorValue(it->getOperand(0));
1222      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
1223      WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
1224      break;
1225    }
1226
1227    case Instruction::Call: {
1228      assert(isTriviallyVectorizableIntrinsic(it));
1229      Module *M = BB->getParent()->getParent();
1230      IntrinsicInst *II = cast<IntrinsicInst>(it);
1231      Intrinsic::ID ID = II->getIntrinsicID();
1232      SmallVector<Value*, 4> Args;
1233      for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
1234        Args.push_back(getVectorValue(II->getArgOperand(i)));
1235      Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
1236      Function *F = Intrinsic::getDeclaration(M, ID, Tys);
1237      WidenMap[it] = Builder.CreateCall(F, Args);
1238      break;
1239    }
1240
1241    default:
1242      // All other instructions are unsupported. Scalarize them.
1243      scalarizeInstruction(it);
1244      break;
1245    }// end of switch.
1246  }// end of for_each instr.
1247}
1248
1249void InnerLoopVectorizer::updateAnalysis() {
1250  // Forget the original basic block.
1251  SE->forgetLoop(OrigLoop);
1252
1253  // Update the dominator tree information.
1254  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
1255         "Entry does not dominate exit.");
1256
1257  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
1258  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
1259  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
1260  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
1261  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
1262  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
1263
1264  DEBUG(DT->verifyAnalysis());
1265}
1266
1267bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
1268  if (!EnableIfConversion)
1269    return false;
1270
1271  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
1272  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
1273
1274  // Collect the blocks that need predication.
1275  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
1276    BasicBlock *BB = LoopBlocks[i];
1277
1278    // We don't support switch statements inside loops.
1279    if (!isa<BranchInst>(BB->getTerminator()))
1280      return false;
1281
1282    // We must have at most two predecessors because we need to convert
1283    // all PHIs to selects.
1284    unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
1285    if (Preds > 2)
1286      return false;
1287
1288    // We must be able to predicate all blocks that need to be predicated.
1289    if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
1290      return false;
1291  }
1292
1293  // We can if-convert this loop.
1294  return true;
1295}
1296
1297bool LoopVectorizationLegality::canVectorize() {
1298  assert(TheLoop->getLoopPreheader() && "No preheader!!");
1299
1300  // We can only vectorize innermost loops.
1301  if (TheLoop->getSubLoopsVector().size())
1302    return false;
1303
1304  // We must have a single backedge.
1305  if (TheLoop->getNumBackEdges() != 1)
1306    return false;
1307
1308  // We must have a single exiting block.
1309  if (!TheLoop->getExitingBlock())
1310    return false;
1311
1312  unsigned NumBlocks = TheLoop->getNumBlocks();
1313
1314  // Check if we can if-convert non single-bb loops.
1315  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
1316    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
1317    return false;
1318  }
1319
1320  // We need to have a loop header.
1321  BasicBlock *Latch = TheLoop->getLoopLatch();
1322  DEBUG(dbgs() << "LV: Found a loop: " <<
1323        TheLoop->getHeader()->getName() << "\n");
1324
1325  // ScalarEvolution needs to be able to find the exit count.
1326  const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
1327  if (ExitCount == SE->getCouldNotCompute()) {
1328    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
1329    return false;
1330  }
1331
1332  // Do not loop-vectorize loops with a tiny trip count.
1333  unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
1334  if (TC > 0u && TC < TinyTripCountThreshold) {
1335    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
1336          "This loop is not worth vectorizing.\n");
1337    return false;
1338  }
1339
1340  // Check if we can vectorize the instructions and CFG in this loop.
1341  if (!canVectorizeInstrs()) {
1342    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
1343    return false;
1344  }
1345
1346  // Go over each instruction and look at memory deps.
1347  if (!canVectorizeMemory()) {
1348    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
1349    return false;
1350  }
1351
1352  // Collect all of the variables that remain uniform after vectorization.
1353  collectLoopUniforms();
1354
1355  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
1356        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
1357        <<"!\n");
1358
1359  // Okay! We can vectorize. At this point we don't have any other mem analysis
1360  // which may limit our maximum vectorization factor, so just return true with
1361  // no restrictions.
1362  return true;
1363}
1364
1365bool LoopVectorizationLegality::canVectorizeInstrs() {
1366  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
1367  BasicBlock *Header = TheLoop->getHeader();
1368
1369  // For each block in the loop.
1370  for (Loop::block_iterator bb = TheLoop->block_begin(),
1371       be = TheLoop->block_end(); bb != be; ++bb) {
1372
1373    // Scan the instructions in the block and look for hazards.
1374    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
1375         ++it) {
1376
1377      if (PHINode *Phi = dyn_cast<PHINode>(it)) {
1378        // This should not happen because the loop should be normalized.
1379        if (Phi->getNumIncomingValues() != 2) {
1380          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
1381          return false;
1382        }
1383
1384        // Check that this PHI type is allowed.
1385        if (!Phi->getType()->isIntegerTy() &&
1386            !Phi->getType()->isPointerTy()) {
1387          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
1388          return false;
1389        }
1390
1391        // If this PHINode is not in the header block, then we know that we
1392        // can convert it to select during if-conversion. No need to check if
1393        // the PHIs in this block are induction or reduction variables.
1394        if (*bb != Header)
1395          continue;
1396
1397        // This is the value coming from the preheader.
1398        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
1399        // Check if this is an induction variable.
1400        InductionKind IK = isInductionVariable(Phi);
1401
1402        if (NoInduction != IK) {
1403          // Int inductions are special because we only allow one IV.
1404          if (IK == IntInduction) {
1405            if (Induction) {
1406              DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
1407              return false;
1408            }
1409            Induction = Phi;
1410          }
1411
1412          DEBUG(dbgs() << "LV: Found an induction variable.\n");
1413          Inductions[Phi] = InductionInfo(StartValue, IK);
1414          continue;
1415        }
1416
1417        if (AddReductionVar(Phi, IntegerAdd)) {
1418          DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
1419          continue;
1420        }
1421        if (AddReductionVar(Phi, IntegerMult)) {
1422          DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
1423          continue;
1424        }
1425        if (AddReductionVar(Phi, IntegerOr)) {
1426          DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
1427          continue;
1428        }
1429        if (AddReductionVar(Phi, IntegerAnd)) {
1430          DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
1431          continue;
1432        }
1433        if (AddReductionVar(Phi, IntegerXor)) {
1434          DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
1435          continue;
1436        }
1437
1438        DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
1439        return false;
1440      }// end of PHI handling
1441
1442      // We still don't handle functions.
1443      CallInst *CI = dyn_cast<CallInst>(it);
1444      if (CI && !isTriviallyVectorizableIntrinsic(it)) {
1445        DEBUG(dbgs() << "LV: Found a call site.\n");
1446        return false;
1447      }
1448
1449      // We do not re-vectorize vectors.
1450      if (!VectorType::isValidElementType(it->getType()) &&
1451          !it->getType()->isVoidTy()) {
1452        DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
1453        return false;
1454      }
1455
1456      // Reduction instructions are allowed to have exit users.
1457      // All other instructions must not have external users.
1458      if (!AllowedExit.count(it))
1459        //Check that all of the users of the loop are inside the BB.
1460        for (Value::use_iterator I = it->use_begin(), E = it->use_end();
1461             I != E; ++I) {
1462          Instruction *U = cast<Instruction>(*I);
1463          // This user may be a reduction exit value.
1464          if (!TheLoop->contains(U)) {
1465            DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
1466            return false;
1467          }
1468        }
1469    } // next instr.
1470
1471  }
1472
1473  if (!Induction) {
1474    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
1475    assert(getInductionVars()->size() && "No induction variables");
1476  }
1477
1478  return true;
1479}
1480
1481void LoopVectorizationLegality::collectLoopUniforms() {
1482  // We now know that the loop is vectorizable!
1483  // Collect variables that will remain uniform after vectorization.
1484  std::vector<Value*> Worklist;
1485  BasicBlock *Latch = TheLoop->getLoopLatch();
1486
1487  // Start with the conditional branch and walk up the block.
1488  Worklist.push_back(Latch->getTerminator()->getOperand(0));
1489
1490  while (Worklist.size()) {
1491    Instruction *I = dyn_cast<Instruction>(Worklist.back());
1492    Worklist.pop_back();
1493
1494    // Look at instructions inside this loop.
1495    // Stop when reaching PHI nodes.
1496    // TODO: we need to follow values all over the loop, not only in this block.
1497    if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
1498      continue;
1499
1500    // This is a known uniform.
1501    Uniforms.insert(I);
1502
1503    // Insert all operands.
1504    for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
1505      Worklist.push_back(I->getOperand(i));
1506    }
1507  }
1508}
1509
1510bool LoopVectorizationLegality::canVectorizeMemory() {
1511  typedef SmallVector<Value*, 16> ValueVector;
1512  typedef SmallPtrSet<Value*, 16> ValueSet;
1513  // Holds the Load and Store *instructions*.
1514  ValueVector Loads;
1515  ValueVector Stores;
1516  PtrRtCheck.Pointers.clear();
1517  PtrRtCheck.Need = false;
1518
1519  // For each block.
1520  for (Loop::block_iterator bb = TheLoop->block_begin(),
1521       be = TheLoop->block_end(); bb != be; ++bb) {
1522
1523    // Scan the BB and collect legal loads and stores.
1524    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
1525         ++it) {
1526
1527      // If this is a load, save it. If this instruction can read from memory
1528      // but is not a load, then we quit. Notice that we don't handle function
1529      // calls that read or write.
1530      if (it->mayReadFromMemory()) {
1531        LoadInst *Ld = dyn_cast<LoadInst>(it);
1532        if (!Ld) return false;
1533        if (!Ld->isSimple()) {
1534          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
1535          return false;
1536        }
1537        Loads.push_back(Ld);
1538        continue;
1539      }
1540
1541      // Save 'store' instructions. Abort if other instructions write to memory.
1542      if (it->mayWriteToMemory()) {
1543        StoreInst *St = dyn_cast<StoreInst>(it);
1544        if (!St) return false;
1545        if (!St->isSimple()) {
1546          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
1547          return false;
1548        }
1549        Stores.push_back(St);
1550      }
1551    } // next instr.
1552  } // next block.
1553
1554  // Now we have two lists that hold the loads and the stores.
1555  // Next, we find the pointers that they use.
1556
1557  // Check if we see any stores. If there are no stores, then we don't
1558  // care if the pointers are *restrict*.
1559  if (!Stores.size()) {
1560    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
1561    return true;
1562  }
1563
1564  // Holds the read and read-write *pointers* that we find.
1565  ValueVector Reads;
1566  ValueVector ReadWrites;
1567
1568  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
1569  // multiple times on the same object. If the ptr is accessed twice, once
1570  // for read and once for write, it will only appear once (on the write
1571  // list). This is okay, since we are going to check for conflicts between
1572  // writes and between reads and writes, but not between reads and reads.
1573  ValueSet Seen;
1574
1575  ValueVector::iterator I, IE;
1576  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
1577    StoreInst *ST = dyn_cast<StoreInst>(*I);
1578    assert(ST && "Bad StoreInst");
1579    Value* Ptr = ST->getPointerOperand();
1580
1581    if (isUniform(Ptr)) {
1582      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
1583      return false;
1584    }
1585
1586    // If we did *not* see this pointer before, insert it to
1587    // the read-write list. At this phase it is only a 'write' list.
1588    if (Seen.insert(Ptr))
1589      ReadWrites.push_back(Ptr);
1590  }
1591
1592  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
1593    LoadInst *LD = dyn_cast<LoadInst>(*I);
1594    assert(LD && "Bad LoadInst");
1595    Value* Ptr = LD->getPointerOperand();
1596    // If we did *not* see this pointer before, insert it to the
1597    // read list. If we *did* see it before, then it is already in
1598    // the read-write list. This allows us to vectorize expressions
1599    // such as A[i] += x;  Because the address of A[i] is a read-write
1600    // pointer. This only works if the index of A[i] is consecutive.
1601    // If the address of i is unknown (for example A[B[i]]) then we may
1602    // read a few words, modify, and write a few words, and some of the
1603    // words may be written to the same address.
1604    if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
1605      Reads.push_back(Ptr);
1606  }
1607
1608  // If we write (or read-write) to a single destination and there are no
1609  // other reads in this loop then is it safe to vectorize.
1610  if (ReadWrites.size() == 1 && Reads.size() == 0) {
1611    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
1612    return true;
1613  }
1614
1615  // Find pointers with computable bounds. We are going to use this information
1616  // to place a runtime bound check.
1617  bool RT = true;
1618  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
1619    if (hasComputableBounds(*I)) {
1620      PtrRtCheck.insert(SE, TheLoop, *I);
1621      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
1622    } else {
1623      RT = false;
1624      break;
1625    }
1626  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
1627    if (hasComputableBounds(*I)) {
1628      PtrRtCheck.insert(SE, TheLoop, *I);
1629      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
1630    } else {
1631      RT = false;
1632      break;
1633    }
1634
1635  // Check that we did not collect too many pointers or found a
1636  // unsizeable pointer.
1637  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
1638    PtrRtCheck.reset();
1639    RT = false;
1640  }
1641
1642  PtrRtCheck.Need = RT;
1643
1644  if (RT) {
1645    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
1646  }
1647
1648  // Now that the pointers are in two lists (Reads and ReadWrites), we
1649  // can check that there are no conflicts between each of the writes and
1650  // between the writes to the reads.
1651  ValueSet WriteObjects;
1652  ValueVector TempObjects;
1653
1654  // Check that the read-writes do not conflict with other read-write
1655  // pointers.
1656  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
1657    GetUnderlyingObjects(*I, TempObjects, DL);
1658    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
1659         it != e; ++it) {
1660      if (!isIdentifiedObject(*it)) {
1661        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
1662        return RT;
1663      }
1664      if (!WriteObjects.insert(*it)) {
1665        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
1666              << **it <<"\n");
1667        return RT;
1668      }
1669    }
1670    TempObjects.clear();
1671  }
1672
1673  /// Check that the reads don't conflict with the read-writes.
1674  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
1675    GetUnderlyingObjects(*I, TempObjects, DL);
1676    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
1677         it != e; ++it) {
1678      if (!isIdentifiedObject(*it)) {
1679        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
1680        return RT;
1681      }
1682      if (WriteObjects.count(*it)) {
1683        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
1684              << **it <<"\n");
1685        return RT;
1686      }
1687    }
1688    TempObjects.clear();
1689  }
1690
1691  // It is safe to vectorize and we don't need any runtime checks.
1692  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
1693  PtrRtCheck.reset();
1694  return true;
1695}
1696
1697bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
1698                                                ReductionKind Kind) {
1699  if (Phi->getNumIncomingValues() != 2)
1700    return false;
1701
1702  // Reduction variables are only found in the loop header block.
1703  if (Phi->getParent() != TheLoop->getHeader())
1704    return false;
1705
1706  // Obtain the reduction start value from the value that comes from the loop
1707  // preheader.
1708  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
1709
1710  // ExitInstruction is the single value which is used outside the loop.
1711  // We only allow for a single reduction value to be used outside the loop.
1712  // This includes users of the reduction, variables (which form a cycle
1713  // which ends in the phi node).
1714  Instruction *ExitInstruction = 0;
1715
1716  // Iter is our iterator. We start with the PHI node and scan for all of the
1717  // users of this instruction. All users must be instructions which can be
1718  // used as reduction variables (such as ADD). We may have a single
1719  // out-of-block user. They cycle must end with the original PHI.
1720  // Also, we can't have multiple block-local users.
1721  Instruction *Iter = Phi;
1722  while (true) {
1723    // If the instruction has no users then this is a broken
1724    // chain and can't be a reduction variable.
1725    if (Iter->use_empty())
1726      return false;
1727
1728    // Any reduction instr must be of one of the allowed kinds.
1729    if (!isReductionInstr(Iter, Kind))
1730      return false;
1731
1732    // Did we find a user inside this block ?
1733    bool FoundInBlockUser = false;
1734    // Did we reach the initial PHI node ?
1735    bool FoundStartPHI = false;
1736
1737    // For each of the *users* of iter.
1738    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
1739         it != e; ++it) {
1740      Instruction *U = cast<Instruction>(*it);
1741      // We already know that the PHI is a user.
1742      if (U == Phi) {
1743        FoundStartPHI = true;
1744        continue;
1745      }
1746
1747      // Check if we found the exit user.
1748      BasicBlock *Parent = U->getParent();
1749      if (!TheLoop->contains(Parent)) {
1750        // Exit if you find multiple outside users.
1751        if (ExitInstruction != 0)
1752          return false;
1753        ExitInstruction = Iter;
1754      }
1755
1756      // We allow in-loop PHINodes which are not the original reduction PHI
1757      // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
1758      // structure) then don't skip this PHI.
1759      if (isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() &&
1760          TheLoop->contains(U) && Iter->getNumUses() > 1)
1761        continue;
1762
1763      // We can't have multiple inside users.
1764      if (FoundInBlockUser)
1765        return false;
1766      FoundInBlockUser = true;
1767      Iter = U;
1768    }
1769
1770    // We found a reduction var if we have reached the original
1771    // phi node and we only have a single instruction with out-of-loop
1772    // users.
1773    if (FoundStartPHI && ExitInstruction) {
1774      // This instruction is allowed to have out-of-loop users.
1775      AllowedExit.insert(ExitInstruction);
1776
1777      // Save the description of this reduction variable.
1778      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
1779      Reductions[Phi] = RD;
1780      return true;
1781    }
1782
1783    // If we've reached the start PHI but did not find an outside user then
1784    // this is dead code. Abort.
1785    if (FoundStartPHI)
1786      return false;
1787  }
1788}
1789
1790bool
1791LoopVectorizationLegality::isReductionInstr(Instruction *I,
1792                                            ReductionKind Kind) {
1793  switch (I->getOpcode()) {
1794  default:
1795    return false;
1796  case Instruction::PHI:
1797    // possibly.
1798    return true;
1799  case Instruction::Add:
1800  case Instruction::Sub:
1801    return Kind == IntegerAdd;
1802  case Instruction::Mul:
1803    return Kind == IntegerMult;
1804  case Instruction::And:
1805    return Kind == IntegerAnd;
1806  case Instruction::Or:
1807    return Kind == IntegerOr;
1808  case Instruction::Xor:
1809    return Kind == IntegerXor;
1810  }
1811}
1812
1813LoopVectorizationLegality::InductionKind
1814LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
1815  Type *PhiTy = Phi->getType();
1816  // We only handle integer and pointer inductions variables.
1817  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
1818    return NoInduction;
1819
1820  // Check that the PHI is consecutive and starts at zero.
1821  const SCEV *PhiScev = SE->getSCEV(Phi);
1822  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
1823  if (!AR) {
1824    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
1825    return NoInduction;
1826  }
1827  const SCEV *Step = AR->getStepRecurrence(*SE);
1828
1829  // Integer inductions need to have a stride of one.
1830  if (PhiTy->isIntegerTy()) {
1831    if (Step->isOne())
1832      return IntInduction;
1833    if (Step->isAllOnesValue())
1834      return ReverseIntInduction;
1835    return NoInduction;
1836  }
1837
1838  // Calculate the pointer stride and check if it is consecutive.
1839  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
1840  if (!C)
1841    return NoInduction;
1842
1843  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
1844  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
1845  if (C->getValue()->equalsInt(Size))
1846    return PtrInduction;
1847
1848  return NoInduction;
1849}
1850
1851bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
1852  assert(TheLoop->contains(BB) && "Unknown block used");
1853
1854  // Blocks that do not dominate the latch need predication.
1855  BasicBlock* Latch = TheLoop->getLoopLatch();
1856  return !DT->dominates(BB, Latch);
1857}
1858
1859bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
1860  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
1861    // We don't predicate loads/stores at the moment.
1862    if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
1863      return false;
1864
1865    // The isntructions below can trap.
1866    switch (it->getOpcode()) {
1867    default: continue;
1868    case Instruction::UDiv:
1869    case Instruction::SDiv:
1870    case Instruction::URem:
1871    case Instruction::SRem:
1872             return false;
1873    }
1874  }
1875
1876  return true;
1877}
1878
1879bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
1880  const SCEV *PhiScev = SE->getSCEV(Ptr);
1881  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
1882  if (!AR)
1883    return false;
1884
1885  return AR->isAffine();
1886}
1887
1888unsigned
1889LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
1890  if (!VTTI) {
1891    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
1892    return 1;
1893  }
1894
1895  float Cost = expectedCost(1);
1896  unsigned Width = 1;
1897  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
1898  for (unsigned i=2; i <= VF; i*=2) {
1899    // Notice that the vector loop needs to be executed less times, so
1900    // we need to divide the cost of the vector loops by the width of
1901    // the vector elements.
1902    float VectorCost = expectedCost(i) / (float)i;
1903    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
1904          (int)VectorCost << ".\n");
1905    if (VectorCost < Cost) {
1906      Cost = VectorCost;
1907      Width = i;
1908    }
1909  }
1910
1911  DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
1912  return Width;
1913}
1914
1915unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
1916  unsigned Cost = 0;
1917
1918  // For each block.
1919  for (Loop::block_iterator bb = TheLoop->block_begin(),
1920       be = TheLoop->block_end(); bb != be; ++bb) {
1921    unsigned BlockCost = 0;
1922    BasicBlock *BB = *bb;
1923
1924    // For each instruction in the old loop.
1925    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
1926      unsigned C = getInstructionCost(it, VF);
1927      Cost += C;
1928      DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
1929            VF << " For instruction: "<< *it << "\n");
1930    }
1931
1932    // We assume that if-converted blocks have a 50% chance of being executed.
1933    // When the code is scalar then some of the blocks are avoided due to CF.
1934    // When the code is vectorized we execute all code paths.
1935    if (Legal->blockNeedsPredication(*bb) && VF == 1)
1936      BlockCost /= 2;
1937
1938    Cost += BlockCost;
1939  }
1940
1941  return Cost;
1942}
1943
1944unsigned
1945LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
1946  assert(VTTI && "Invalid vector target transformation info");
1947
1948  // If we know that this instruction will remain uniform, check the cost of
1949  // the scalar version.
1950  if (Legal->isUniformAfterVectorization(I))
1951    VF = 1;
1952
1953  Type *RetTy = I->getType();
1954  Type *VectorTy = ToVectorTy(RetTy, VF);
1955
1956  // TODO: We need to estimate the cost of intrinsic calls.
1957  switch (I->getOpcode()) {
1958  case Instruction::GetElementPtr:
1959    // We mark this instruction as zero-cost because scalar GEPs are usually
1960    // lowered to the intruction addressing mode. At the moment we don't
1961    // generate vector geps.
1962    return 0;
1963  case Instruction::Br: {
1964    return VTTI->getCFInstrCost(I->getOpcode());
1965  }
1966  case Instruction::PHI:
1967    //TODO: IF-converted IFs become selects.
1968    return 0;
1969  case Instruction::Add:
1970  case Instruction::FAdd:
1971  case Instruction::Sub:
1972  case Instruction::FSub:
1973  case Instruction::Mul:
1974  case Instruction::FMul:
1975  case Instruction::UDiv:
1976  case Instruction::SDiv:
1977  case Instruction::FDiv:
1978  case Instruction::URem:
1979  case Instruction::SRem:
1980  case Instruction::FRem:
1981  case Instruction::Shl:
1982  case Instruction::LShr:
1983  case Instruction::AShr:
1984  case Instruction::And:
1985  case Instruction::Or:
1986  case Instruction::Xor:
1987    return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
1988  case Instruction::Select: {
1989    SelectInst *SI = cast<SelectInst>(I);
1990    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
1991    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
1992    Type *CondTy = SI->getCondition()->getType();
1993    if (ScalarCond)
1994      CondTy = VectorType::get(CondTy, VF);
1995
1996    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
1997  }
1998  case Instruction::ICmp:
1999  case Instruction::FCmp: {
2000    Type *ValTy = I->getOperand(0)->getType();
2001    VectorTy = ToVectorTy(ValTy, VF);
2002    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
2003  }
2004  case Instruction::Store: {
2005    StoreInst *SI = cast<StoreInst>(I);
2006    Type *ValTy = SI->getValueOperand()->getType();
2007    VectorTy = ToVectorTy(ValTy, VF);
2008
2009    if (VF == 1)
2010      return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
2011                                   SI->getAlignment(),
2012                                   SI->getPointerAddressSpace());
2013
2014    // Scalarized stores.
2015    if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
2016      unsigned Cost = 0;
2017      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
2018                                            ValTy);
2019      // The cost of extracting from the value vector.
2020      Cost += VF * (ExtCost);
2021      // The cost of the scalar stores.
2022      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
2023                                         ValTy->getScalarType(),
2024                                         SI->getAlignment(),
2025                                         SI->getPointerAddressSpace());
2026      return Cost;
2027    }
2028
2029    // Wide stores.
2030    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
2031                                 SI->getPointerAddressSpace());
2032  }
2033  case Instruction::Load: {
2034    LoadInst *LI = cast<LoadInst>(I);
2035
2036    if (VF == 1)
2037      return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
2038                                   LI->getAlignment(),
2039                                   LI->getPointerAddressSpace());
2040
2041    // Scalarized loads.
2042    if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
2043      unsigned Cost = 0;
2044      unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
2045      // The cost of inserting the loaded value into the result vector.
2046      Cost += VF * (InCost);
2047      // The cost of the scalar stores.
2048      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
2049                                         RetTy->getScalarType(),
2050                                         LI->getAlignment(),
2051                                         LI->getPointerAddressSpace());
2052      return Cost;
2053    }
2054
2055    // Wide loads.
2056    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
2057                                 LI->getPointerAddressSpace());
2058  }
2059  case Instruction::ZExt:
2060  case Instruction::SExt:
2061  case Instruction::FPToUI:
2062  case Instruction::FPToSI:
2063  case Instruction::FPExt:
2064  case Instruction::PtrToInt:
2065  case Instruction::IntToPtr:
2066  case Instruction::SIToFP:
2067  case Instruction::UIToFP:
2068  case Instruction::Trunc:
2069  case Instruction::FPTrunc:
2070  case Instruction::BitCast: {
2071    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
2072    return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
2073  }
2074  case Instruction::Call: {
2075    assert(isTriviallyVectorizableIntrinsic(I));
2076    IntrinsicInst *II = cast<IntrinsicInst>(I);
2077    Type *RetTy = ToVectorTy(II->getType(), VF);
2078    SmallVector<Type*, 4> Tys;
2079    for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
2080      Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
2081    return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
2082  }
2083  default: {
2084    // We are scalarizing the instruction. Return the cost of the scalar
2085    // instruction, plus the cost of insert and extract into vector
2086    // elements, times the vector width.
2087    unsigned Cost = 0;
2088
2089    bool IsVoid = RetTy->isVoidTy();
2090
2091    unsigned InsCost = (IsVoid ? 0 :
2092                        VTTI->getInstrCost(Instruction::InsertElement,
2093                                           VectorTy));
2094
2095    unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
2096                                          VectorTy);
2097
2098    // The cost of inserting the results plus extracting each one of the
2099    // operands.
2100    Cost += VF * (InsCost + ExtCost * I->getNumOperands());
2101
2102    // The cost of executing VF copies of the scalar instruction.
2103    Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
2104    return Cost;
2105  }
2106  }// end of switch.
2107}
2108
2109Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
2110  if (Scalar->isVoidTy() || VF == 1)
2111    return Scalar;
2112  return VectorType::get(Scalar, VF);
2113}
2114
2115char LoopVectorize::ID = 0;
2116static const char lv_name[] = "Loop Vectorization";
2117INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
2118INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
2119INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
2120INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
2121INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
2122
2123namespace llvm {
2124  Pass *createLoopVectorizePass() {
2125    return new LoopVectorize();
2126  }
2127}
2128
2129
2130