LoopVectorize.cpp revision 3f4f420ab7acb10221ba971543a7eed5489fb626
//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
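//
// Conceptual example (pseudo-code, not LLVM IR): a loop such as
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is widened with VF = 4 so that each wide iteration processes four
// elements and the index advances by four:
//
//   for (i = 0; i + 3 < n; i += 4)
//     A[i:i+3] = B[i:i+3] + <42, 42, 42, 42>;
//
// with the remaining iterations handled by a scalar epilogue loop.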
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <map>

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                    cl::desc("Sets the SIMD width. Zero is autoselect."));

static cl::opt<unsigned>
VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
                    cl::desc("Sets the vectorization unroll count. "
                             "Zero is autoselect."));

static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                   cl::desc("Enable if-conversion during vectorization."));

/// We don't vectorize loops with a known constant trip count below this number.
static cl::opt<unsigned>
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                             cl::Hidden,
                             cl::desc("Don't vectorize loops with a constant "
                                      "trip count that is smaller than this "
                                      "value."));

/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;

/// When performing memory disambiguation checks at runtime do not make more
/// than this number of comparisons.
static const unsigned RuntimeMemoryCheckThreshold = 8;

/// Maximum SIMD width.
static const unsigned MaxVectorWidth = 64;

/// Maximum vectorization unroll count.
static const unsigned MaxUnrollFactor = 16;

/// The cost of a loop that is considered 'small' by the unroller.
static const unsigned SmallLoopCost = 20;

namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found in the loop.
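///
/// A rough sketch of the generated structure (illustrative, assuming a trip
/// count of n and VF = 4): the vector body executes floor(n / 4) wide
/// iterations, a middle block then decides whether any iterations remain,
/// and a scalar epilogue loop executes the remaining n mod 4 iterations.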
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                      DominatorTree *DT, DataLayout *DL,
                      const TargetLibraryInfo *TLI, unsigned VecWidth,
                      unsigned UnrollFactor)
      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
        OldInduction(0), WidenMap(UnrollFactor) {}

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *Legal) {
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop(Legal);
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop(Legal);
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }

  virtual ~InnerLoopVectorizer() {}

protected:
  /// A small list of PHINodes.
  typedef SmallVector<PHINode*, 4> PhiVector;
  /// When we unroll loops we have multiple vector values for each scalar.
  /// This data structure holds the unrolled and vectorized values that
  /// originated from one scalar instruction.
  typedef SmallVector<Value*, 2> VectorParts;

  // When we if-convert we need to create edge masks. We have to cache values
  // so that we don't end up with exponential recursion/IR.
  typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
                   VectorParts> EdgeMaskCache;

  /// Add code that checks at runtime if the accessed arrays overlap.
  /// Returns the comparator value or NULL if no check is needed.
  Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
                               Instruction *Loc);
  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop(LoopVectorizationLegality *Legal);
  /// Copy and widen the instructions from the old loop.
  virtual void vectorizeLoop(LoopVectorizationLegality *Legal);

  /// \brief The loop exit block may have single-value PHI nodes where the
  /// incoming value is 'Undef'. While vectorizing we only handled real values
  /// that were defined inside the loop. Here we fix the 'undef case'.
  /// See PR14725.
  void fixLCSSAPHIs();

  /// A helper function that computes the predicate of the block BB, assuming
  /// that the header block of the loop is set to True. It returns the *entry*
  /// mask for the block BB.
  VectorParts createBlockInMask(BasicBlock *BB);
  /// A helper function that computes the predicate of the edge between SRC
  /// and DST.
  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
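
  // Informal sketch of the mask computation (restating the two helpers
  // above): the entry mask of a block is accumulated over its predecessors,
  //
  //   BlockMask(BB) = OR over P in preds(BB) of
  //                     (BlockMask(P) AND EdgeMask(P, BB))
  //
  // where EdgeMask(P, BB) is P's branch condition (or its negation) and the
  // loop header starts as all-true. The EdgeMaskCache keeps this linear.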

  /// A helper function to vectorize a single BB within the innermost loop.
  void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
                            PhiVector *PV);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
                           LoopVectorizationLegality *Legal,
                           unsigned UF, unsigned VF, PhiVector *PV);

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars.
  virtual void scalarizeInstruction(Instruction *Instr);

  /// Vectorize Load and Store instructions.
  virtual void vectorizeMemoryInstruction(Instruction *Instr,
                                  LoopVectorizationLegality *Legal);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
  /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
  /// The sequence starts at StartIdx.
  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  VectorParts &getVectorValue(Value *V);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// This is a helper class that holds the vectorizer state. It maps scalar
  /// instructions to vector instructions. When the code is 'unrolled' then
  /// a single scalar value is mapped to multiple vector parts. The parts
  /// are stored in the VectorPart type.
  struct ValueMap {
    /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
    /// are mapped.
    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

    /// \return True if 'Key' is saved in the Value Map.
    bool has(Value *Key) const { return MapStorage.count(Key); }

    /// Initializes a new entry in the map. Sets all of the vector parts to the
    /// same value in 'Val'.
    /// \return A reference to a vector with splat values.
    VectorParts &splat(Value *Key, Value *Val) {
      VectorParts &Entry = MapStorage[Key];
      Entry.assign(UF, Val);
      return Entry;
    }

    ///\return A reference to the value that is stored at 'Key'.
    VectorParts &get(Value *Key) {
      VectorParts &Entry = MapStorage[Key];
      if (Entry.empty())
        Entry.resize(UF);
      assert(Entry.size() == UF);
      return Entry;
    }

  private:
    /// The unroll factor. Each entry in the map stores this number of vector
    /// elements.
    unsigned UF;

    /// Map storage. We use std::map and not DenseMap because insertions to a
    /// dense map invalidate its iterators.
    std::map<Value *, VectorParts> MapStorage;
  };
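
  // A minimal usage sketch (illustrative only; 'Scalar' and 'WideVal' are
  // hypothetical values, not names used in this file):
  //
  //   ValueMap WidenMap(2);             // UF = 2: two parts per scalar
  //   WidenMap.splat(Scalar, WideVal);  // both parts start out as WideVal
  //   VectorParts &Parts = WidenMap.get(Scalar);
  //   Parts[1] = ...;                   // overwrite the second unrolled part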

  /// The original loop.
  Loop *OrigLoop;
  /// Scev analysis to use.
  ScalarEvolution *SE;
  /// Loop Info.
  LoopInfo *LI;
  /// Dominator Tree.
  DominatorTree *DT;
  /// Data Layout.
  DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

protected:
  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle block between the vector and the scalar loops.
  BasicBlock *LoopMiddleBlock;
  /// The exit block of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  BasicBlock *LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  /// Holds the extended (to the widest induction type) start index.
  Value *ExtendedIdx;
  /// Maps scalars to widened vectors.
  ValueMap WidenMap;
  EdgeMaskCache MaskCache;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                    DominatorTree *DT, DataLayout *DL,
                    const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
    InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }

private:
  virtual void scalarizeInstruction(Instruction *Instr);
  virtual void vectorizeMemoryInstruction(Instruction *Instr,
                                          LoopVectorizationLegality *Legal);
  virtual Value *getBroadcastInstrs(Value *V);
  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
  virtual Value *reverseVector(Value *Vec);
};

/// \brief Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// \brief Set the debug location in the builder using the debug location in
/// the instruction.
static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
    B.SetCurrentDebugLocation(Inst->getDebugLoc());
  else
    B.SetCurrentDebugLocation(DebugLoc());
}
392
393/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
394/// to what vectorization factor.
395/// This class does not look at the profitability of vectorization, only the
396/// legality. This class has two main kinds of checks:
397/// * Memory checks - The code in canVectorizeMemory checks if vectorization
398///   will change the order of memory accesses in a way that will change the
399///   correctness of the program.
400/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
401/// checks for a number of different conditions, such as the availability of a
402/// single induction variable, that all types are supported and vectorize-able,
403/// etc. This code reflects the capabilities of InnerLoopVectorizer.
404/// This class is also used by InnerLoopVectorizer for identifying
405/// induction variable and the different reduction variables.
406class LoopVectorizationLegality {
407public:
408  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
409                            DominatorTree *DT, TargetLibraryInfo *TLI)
410      : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
411        Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
412        MaxSafeDepDistBytes(-1U) {}
413
414  /// This enum represents the kinds of reductions that we support.
415  enum ReductionKind {
416    RK_NoReduction, ///< Not a reduction.
417    RK_IntegerAdd,  ///< Sum of integers.
418    RK_IntegerMult, ///< Product of integers.
419    RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
420    RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
421    RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
422    RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
423    RK_FloatAdd,    ///< Sum of floats.
424    RK_FloatMult,   ///< Product of floats.
425    RK_FloatMinMax  ///< Min/max implemented in terms of select(cmp()).
426  };
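
  // Conventional example (for illustration): an RK_IntegerAdd reduction is
  //
  //   int sum = 10;           // the start value; it need not be zero
  //   for (i = 0; i < n; ++i)
  //     sum += A[i];          // 'sum' is the reduction variable
  //
  // and is vectorized by keeping VF partial sums in a vector and combining
  // them into a single scalar after the loop.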

  /// This enum represents the kinds of inductions that we support.
  enum InductionKind {
    IK_NoInduction,         ///< Not an induction variable.
    IK_IntInduction,        ///< Integer induction variable. Step = 1.
    IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
    IK_PtrInduction,        ///< Pointer induction var. Step = sizeof(elem).
    IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = - sizeof(elem).
  };
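
  // Illustrative forms (typical shapes, shown for reference):
  //
  //   for (int i = 0; i < n; ++i)        // i: IK_IntInduction
  //   for (int i = n; i > 0; --i)        // i: IK_ReverseIntInduction
  //   for (int *p = A; p != A + n; ++p)  // p: IK_PtrInduction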

  // This enum represents the kind of minmax reduction.
  enum MinMaxReductionKind {
    MRK_Invalid,
    MRK_UIntMin,
    MRK_UIntMax,
    MRK_SIntMin,
    MRK_SIntMax,
    MRK_FloatMin,
    MRK_FloatMax
  };

  /// This POD struct holds information about reduction variables.
  struct ReductionDescriptor {
    ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
      Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}

    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
                        MinMaxReductionKind MK)
        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    TrackingVH<Value> StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
    // If this is a min/max reduction, the kind of min/max.
    MinMaxReductionKind MinMaxKind;
  };

  /// This POD struct holds information about a potential reduction operation.
  struct ReductionInstDesc {
    ReductionInstDesc(bool IsRedux, Instruction *I) :
      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}

    ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
      IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}

    // Is this instruction a reduction candidate.
    bool IsReduction;
    // The last instruction in a min/max pattern (select of the select(icmp())
    // pattern), or the current reduction instruction otherwise.
    Instruction *PatternLastInst;
    // If this is a min/max pattern, the comparison predicate.
    MinMaxReductionKind MinMaxKind;
  };

  // This POD struct holds information for the runtime memory legality check
  // that verifies that a group of pointers do not overlap.
  struct RuntimePointerCheck {
    RuntimePointerCheck() : Need(false) {}

    /// Reset the state of the pointer runtime information.
    void reset() {
      Need = false;
      Pointers.clear();
      Starts.clear();
      Ends.clear();
    }

    /// Insert a pointer and calculate the start and end SCEVs.
    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
                unsigned DepSetId);

    /// This flag indicates if we need to add the runtime check.
    bool Need;
    /// Holds the pointers that we need to check.
    SmallVector<TrackingVH<Value>, 2> Pointers;
    /// Holds the pointer value at the beginning of the loop.
    SmallVector<const SCEV*, 2> Starts;
    /// Holds the pointer value at the end of the loop.
    SmallVector<const SCEV*, 2> Ends;
    /// Holds the information if this pointer is used for writing to memory.
    SmallVector<bool, 2> IsWritePtr;
    /// Holds the id of the set of pointers that could be dependent because of
    /// a shared underlying object.
    SmallVector<unsigned, 2> DependencySetId;
  };
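
  // Sketch of the check that is emitted from this information (see
  // addRuntimeCheck below): two ranges [Start0, End0] and [Start1, End1]
  // may conflict iff
  //
  //   Start0 <= End1 && Start1 <= End0
  //
  // so that the vectorized code can be bypassed when the ranges overlap.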

  /// A POD for saving information about induction variables.
  struct InductionInfo {
    InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
    InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
    /// Start value.
    TrackingVH<Value> StartValue;
    /// Induction kind.
    InductionKind IK;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode*, InductionInfo> InductionList;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Returns the widest induction type.
  Type *getWidestInductionType() { return WidestIndTy; }

  /// Returns True if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or when the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  /// 0 - Stride is unknown or non-consecutive.
  /// 1 - Address is consecutive.
  /// -1 - Address is consecutive, and decreasing.
  int isConsecutivePtr(Value *Ptr);
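
  // Illustrative classifications (in the spirit of the comment above):
  //
  //   A[i]      ->  1  (consecutive, increasing)
  //   A[n - i]  -> -1  (consecutive, decreasing)
  //   A[2 * i]  ->  0  (strided, not consecutive)
  //   A[B[i]]   ->  0  (indirect; stride unknown)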

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }

  /// Returns the information that we collected about the runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }

  /// This function returns the identity element (or neutral element) for
  /// the operation K.
  static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }

private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed. \p SafePtrs is a list of addresses that are known to be legal
  /// and we know that we can read from them without segfault.
  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSet<Value *, 8>& SafePtrs);

  /// Returns true if 'Phi' is a reduction variable of kind 'Kind'.
  /// If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns a struct describing if the instruction 'I' can be a reduction
  /// variable of type 'Kind'. If the reduction is a min/max pattern of
  /// select(icmp()) this function advances the instruction pointer 'I' from
  /// the compare instruction to the select instruction and stores this pointer
  /// in the 'PatternLastInst' member of the returned struct.
  ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
                                     ReductionInstDesc &Desc);
  /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
  /// pattern corresponding to a min(X, Y) or max(X, Y).
  static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev);
  /// Returns the induction kind of Phi. This function may return NoInduction
  /// if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  DataLayout *DL;
  /// Dominators.
  DominatorTree *DT;
  /// Target Library Info.
  TargetLibraryInfo *TLI;

  //  ---  vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;
  /// Holds the widest induction type encountered.
  Type *WidestIndTy;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

  unsigned MaxSafeDepDistBytes;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             DataLayout *DL, const TargetLibraryInfo *TLI)
      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}

  /// Information about vectorization costs.
  struct VectorizationFactor {
    unsigned Width; // Vector width with best cost.
    unsigned Cost;  // Cost of the loop with that width.
  };
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(bool OptForSize,
                                                unsigned UserVF);

  /// \return The size (in bits) of the widest type in the code that
  /// needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  unsigned getWidestType();

  /// \return The most profitable unroll factor.
  /// If UserUF is non-zero then this method finds the best unroll-factor
  /// based on register pressure and other parameters.
  /// VF and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
                              unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return Information about the register usage of the loop.
  RegisterUsage calculateRegisterUsage();

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
  /// Target data layout information.
  DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
};

/// Utility class for getting and setting loop vectorizer hints in the form
/// of loop metadata.
struct LoopVectorizeHints {
  /// Vectorization width.
  unsigned Width;
  /// Vectorization unroll factor.
  unsigned Unroll;

  LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
  : Width(VectorizationFactor)
  , Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
  , LoopID(L->getLoopID()) {
    getHints(L);
    // The command line options override any loop metadata except when
    // width == 1, which is used to indicate that the loop is already
    // vectorized.
    if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1)
      Width = VectorizationFactor;
    if (VectorizationUnroll.getNumOccurrences() > 0)
      Unroll = VectorizationUnroll;

    DEBUG(if (DisableUnrolling && Unroll == 1)
            dbgs() << "LV: Unrolling disabled by the pass manager\n");
  }

  /// Return the loop vectorizer metadata prefix.
  static StringRef Prefix() { return "llvm.vectorizer."; }

  MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) {
    SmallVector<Value*, 2> Vals;
    Vals.push_back(MDString::get(Context, Name));
    Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V));
    return MDNode::get(Context, Vals);
  }
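
  // Illustrative shape of a hint built by createHint (assuming the textual
  // metadata syntax of this LLVM version):
  //
  //   !0 = metadata !{metadata !"llvm.vectorizer.width", i32 8}
  //
  // Such nodes are attached as operands of the loop id metadata.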

  /// Mark the loop L as already vectorized by setting the width to 1.
  void setAlreadyVectorized(Loop *L) {
    LLVMContext &Context = L->getHeader()->getContext();

    Width = 1;

    // Create a new loop id with one more operand for the already_vectorized
    // hint. If the loop already has a loop id then copy the existing operands.
    SmallVector<Value*, 4> Vals(1);
    if (LoopID)
      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i)
        Vals.push_back(LoopID->getOperand(i));

    Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));

    MDNode *NewLoopID = MDNode::get(Context, Vals);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);

    L->setLoopID(NewLoopID);
    if (LoopID)
      LoopID->replaceAllUsesWith(NewLoopID);

    LoopID = NewLoopID;
  }

private:
  MDNode *LoopID;

  /// Find hints specified in the loop metadata.
  void getHints(const Loop *L) {
    if (!LoopID)
      return;

    // First operand should refer to the loop id itself.
    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      const MDString *S = 0;
      SmallVector<Value*, 4> Args;

      // The expected hint is either an MDString or an MDNode with an MDString
      // as its first operand.
      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
        if (!MD || MD->getNumOperands() == 0)
          continue;
        S = dyn_cast<MDString>(MD->getOperand(0));
        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
          Args.push_back(MD->getOperand(i));
      } else {
        S = dyn_cast<MDString>(LoopID->getOperand(i));
        assert(Args.size() == 0 && "too many arguments for MDString");
      }

      if (!S)
        continue;

      // Check if the hint starts with the vectorizer prefix.
      StringRef Hint = S->getString();
      if (!Hint.startswith(Prefix()))
        continue;
      // Remove the prefix.
      Hint = Hint.substr(Prefix().size(), StringRef::npos);

      if (Args.size() == 1)
        getHint(Hint, Args[0]);
    }
  }

  // Check string hint with one operand.
  void getHint(StringRef Hint, Value *Arg) {
    const ConstantInt *C = dyn_cast<ConstantInt>(Arg);
    if (!C) return;
    unsigned Val = C->getZExtValue();

    if (Hint == "width") {
      if (isPowerOf2_32(Val) && Val <= MaxVectorWidth)
        Width = Val;
      else
        DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n");
    } else if (Hint == "unroll") {
      if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor)
        Unroll = Val;
      else
        DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
    } else {
      DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << "\n");
    }
  }
};

/// The LoopVectorize Pass.
struct LoopVectorize : public LoopPass {
  /// Pass identification, replacement for typeid.
  static char ID;

  explicit LoopVectorize(bool NoUnrolling = false)
    : LoopPass(ID), DisableUnrolling(NoUnrolling) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  DataLayout *DL;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;
  TargetLibraryInfo *TLI;
  bool DisableUnrolling;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
    // We only vectorize innermost loops.
    if (!L->empty())
      return false;

    SE = &getAnalysis<ScalarEvolution>();
    DL = getAnalysisIfAvailable<DataLayout>();
    LI = &getAnalysis<LoopInfo>();
    TTI = &getAnalysis<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();
    TLI = getAnalysisIfAvailable<TargetLibraryInfo>();

    // If the target claims to have no vector registers don't attempt
    // vectorization.
    if (!TTI->getNumberOfRegisters(true))
      return false;

    if (DL == NULL) {
      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n");
      return false;
    }

    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
          L->getHeader()->getParent()->getName() << "\"\n");

    LoopVectorizeHints Hints(L, DisableUnrolling);

    if (Hints.Width == 1 && Hints.Unroll == 1) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Check if it is legal to vectorize the loop.
    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Use the cost model.
    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);

    // Check the function attributes to find out if this function should be
    // optimized for size.
    Function *F = L->getHeader()->getParent();
    Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
    Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
    unsigned FnIndex = AttributeSet::FunctionIndex;
    bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
    bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);

    if (NoFloat) {
      DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat "
            "attribute is used.\n");
      return false;
    }

    // Select the optimal vectorization factor.
    LoopVectorizationCostModel::VectorizationFactor VF;
    VF = CM.selectVectorizationFactor(OptForSize, Hints.Width);
    // Select the unroll factor.
    unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
                                        VF.Cost);

    if (VF.Width == 1) {
      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    }

    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " <<
          F->getParent()->getModuleIdentifier() << "\n");
    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");

    if (VF.Width == 1) {
      if (UF == 1)
        return false;
      // We decided not to vectorize, but we may want to unroll.
      InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
      Unroller.vectorize(&LVL);
    } else {
      // If we decided that it is *legal* to vectorize the loop then do it.
      InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
      LB.vectorize(&LVL);
    }

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized(L);

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    LoopPass::getAnalysisUsage(AU);
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<DominatorTree>();
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<TargetTransformInfo>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
  }

};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel.
//===----------------------------------------------------------------------===//

void
LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
                                                       Loop *Lp, Value *Ptr,
                                                       bool WritePtr,
                                                       unsigned DepSetId) {
  const SCEV *Sc = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
  assert(AR && "Invalid addrec expression");
  const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
  Pointers.push_back(Ptr);
  Starts.push_back(AR->getStart());
  Ends.push_back(ScEnd);
  IsWritePtr.push_back(WritePtr);
  DependencySetId.push_back(DepSetId);
}

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // Save the current insertion location.
  Instruction *Loc = Builder.GetInsertPoint();

  // We need to place the broadcast of invariant variables outside the loop.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;

  // Place the code for broadcasting invariant variables in the new preheader.
  if (Invariant)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  // Restore the builder insertion point.
  if (Invariant)
    Builder.SetInsertPoint(Loc);

  return Shuf;
}

Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
                                                 bool Negate) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  // Create the types.
  Type *ITy = Val->getType()->getScalarType();
  VectorType *Ty = cast<VectorType>(Val->getType());
  int VLen = Ty->getNumElements();
  SmallVector<Constant*, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i) {
    int64_t Idx = Negate ? (-i) : i;
    Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
  }

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  return Builder.CreateAdd(Val, Cv, "induction");
}
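
// Worked example (for illustration): with VF = 4, StartIdx = 0 and a splat
// input <i, i, i, i>, the constant vector built above is <0, 1, 2, 3>, so the
// result is <i, i+1, i+2, i+3>; with Negate set it is <i, i-1, i-2, i-3>.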

int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
  // Make sure that the pointer does not point to structs.
  if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
    return 0;

  // If this value is a pointer induction variable we know it is consecutive.
  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  if (Phi && Inductions.count(Phi)) {
    InductionInfo II = Inductions[Phi];
    if (IK_PtrInduction == II.IK)
      return 1;
    else if (IK_ReversePtrInduction == II.IK)
      return -1;
  }

  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return 0;

  unsigned NumOperands = Gep->getNumOperands();
  Value *LastIndex = Gep->getOperand(NumOperands - 1);

  Value *GpPtr = Gep->getPointerOperand();
  // If this GEP value is a consecutive pointer induction variable and all of
  // the indices are loop invariant then we know it is consecutive.
  Phi = dyn_cast<PHINode>(GpPtr);
  if (Phi && Inductions.count(Phi)) {

    // Make sure that the pointer does not point to structs.
    PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
    if (GepPtrType->getElementType()->isAggregateType())
      return 0;

    // Make sure that all of the index operands are loop invariant.
    for (unsigned i = 1; i < NumOperands; ++i)
      if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
        return 0;

    InductionInfo II = Inductions[Phi];
    if (IK_PtrInduction == II.IK)
      return 1;
    else if (IK_ReversePtrInduction == II.IK)
      return -1;
  }

  // Check that all of the gep indices are uniform except for the last.
  for (unsigned i = 0; i < NumOperands - 1; ++i)
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
      return 0;

  // We can emit wide load/stores only if the last index is the induction
  // variable.
  const SCEV *Last = SE->getSCEV(LastIndex);
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // The memory is consecutive because the last index is consecutive
    // and all other indices are loop invariant.
    if (Step->isOne())
      return 1;
    if (Step->isAllOnesValue())
      return -1;
  }

  return 0;
}

bool LoopVectorizationLegality::isUniform(Value *V) {
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}

InnerLoopVectorizer::VectorParts&
InnerLoopVectorizer::getVectorValue(Value *V) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");

  // If we have this scalar in the map, return it.
  if (WidenMap.has(V))
    return WidenMap.get(V);

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  return WidenMap.splat(V, B);
}

Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  SmallVector<Constant*, 8> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "reverse");
}
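
// For illustration: with VF = 4 the shuffle mask built above is <3, 2, 1, 0>,
// so an input <a, b, c, d> is reversed into <d, c, b, a>.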


void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                             LoopVectorizationLegality *Legal) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
  unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
  unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;

  if (ScalarAllocatedSize != VectorElementSize)
    return scalarizeInstruction(Instr);

  // If the pointer is loop invariant or if it is non-consecutive,
  // scalarize the access.
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  bool Reverse = ConsecutiveStride < 0;
  bool UniformLoad = LI && Legal->isUniform(Ptr);
  if (!ConsecutiveStride || UniformLoad)
    return scalarizeInstruction(Instr);

  Constant *Zero = Builder.getInt32(0);
  VectorParts &Entry = WidenMap.get(Instr);

  // Handle consecutive loads/stores.
  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
    setDebugLocFromInst(Builder, Gep);
    Value *PtrOperand = Gep->getPointerOperand();
    Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
    FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);

    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
    Gep2->setOperand(0, FirstBasePtr);
    Gep2->setName("gep.indvar.base");
    Ptr = Builder.Insert(Gep2);
  } else if (Gep) {
    setDebugLocFromInst(Builder, Gep);
    assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
                               OrigLoop) && "Base ptr must be invariant");

    // The last index does not have to be the induction. It can be
    // consecutive and be a function of the index. For example A[I+1];
    unsigned NumOperands = Gep->getNumOperands();
    unsigned LastOperand = NumOperands - 1;
    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());

    for (unsigned i = 0; i < NumOperands; ++i) {
      Value *GepOperand = Gep->getOperand(i);
      Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);

      // Update last index or loop invariant instruction anchored in loop.
      if (i == LastOperand ||
          (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
        assert((i == LastOperand ||
               SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
               "Must be last index or loop invariant");

        VectorParts &GEPParts = getVectorValue(GepOperand);
        Value *Index = GEPParts[0];
        Index = Builder.CreateExtractElement(Index, Zero);
        Gep2->setOperand(i, Index);
        Gep2->setName("gep.indvar.idx");
      }
    }
    Ptr = Builder.Insert(Gep2);
  } else {
    // Use the induction element ptr.
    assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
    setDebugLocFromInst(Builder, Ptr);
    VectorParts &PtrVal = getVectorValue(Ptr);
    Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
  }

  // Handle Stores:
  if (SI) {
    assert(!Legal->isUniform(SI->getPointerOperand()) &&
           "We do not allow storing to uniform addresses");
    setDebugLocFromInst(Builder, SI);
    // We don't want to update the value in the map as it might be used in
    // another expression. So don't use a reference type for "StoredVal".
    VectorParts StoredVal = getVectorValue(SI->getValueOperand());

    for (unsigned Part = 0; Part < UF; ++Part) {
      // Calculate the pointer for the specific unroll-part.
      Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));

      if (Reverse) {
        // If we store to reverse consecutive memory locations then we need
        // to reverse the order of elements in the stored value.
        StoredVal[Part] = reverseVector(StoredVal[Part]);
        // If the address is consecutive but reversed, then the
        // wide store needs to start at the last vector element.
        PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
        PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
      }

      Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                            DataTy->getPointerTo(AddressSpace));
      Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load needs to start at the last vector element.
      PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
      PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
    }

    Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                          DataTy->getPointerTo(AddressSpace));
    Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
    cast<LoadInst>(LI)->setAlignment(Alignment);
    Entry[Part] = Reverse ? reverseVector(LI) : LI;
  }
}
1307
1308void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
1309  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
1310  // Holds vector parameters or scalars, in case of uniform vals.
1311  SmallVector<VectorParts, 4> Params;
1312
1313  setDebugLocFromInst(Builder, Instr);
1314
1315  // Find all of the vectorized parameters.
1316  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
1317    Value *SrcOp = Instr->getOperand(op);
1318
1319    // If we are accessing the old induction variable, use the new one.
1320    if (SrcOp == OldInduction) {
1321      Params.push_back(getVectorValue(SrcOp));
1322      continue;
1323    }
1324
1325    // Try using previously calculated values.
1326    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
1327
1328    // If the src is an instruction that appeared earlier in the basic block
1329    // then it should already be vectorized.
1330    if (SrcInst && OrigLoop->contains(SrcInst)) {
1331      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
1332      // The parameter is a vector value from earlier.
1333      Params.push_back(WidenMap.get(SrcInst));
1334    } else {
1335      // The parameter is a scalar from outside the loop. Maybe even a constant.
1336      VectorParts Scalars;
1337      Scalars.append(UF, SrcOp);
1338      Params.push_back(Scalars);
1339    }
1340  }
1341
1342  assert(Params.size() == Instr->getNumOperands() &&
1343         "Invalid number of operands");
1344
1345  // Does this instruction return a value ?
1346  bool IsVoidRetTy = Instr->getType()->isVoidTy();
1347
1348  Value *UndefVec = IsVoidRetTy ? 0 :
1349    UndefValue::get(VectorType::get(Instr->getType(), VF));
1350  // Create a new entry in the WidenMap and initialize it to Undef or Null.
1351  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
1352
1353  // For each vector unroll 'part':
1354  for (unsigned Part = 0; Part < UF; ++Part) {
1355    // For each scalar that we create:
1356    for (unsigned Width = 0; Width < VF; ++Width) {
1357      Instruction *Cloned = Instr->clone();
1358      if (!IsVoidRetTy)
1359        Cloned->setName(Instr->getName() + ".cloned");
1360      // Replace the operands of the cloned instruction with extracted scalars.
1361      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
1362        Value *Op = Params[op][Part];
1363        // Param is a vector. Need to extract the right lane.
1364        if (Op->getType()->isVectorTy())
1365          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
1366        Cloned->setOperand(op, Op);
1367      }
1368
1369      // Place the cloned scalar in the new loop.
1370      Builder.Insert(Cloned);
1371
1372      // If the original scalar returns a value we need to place it in a vector
1373      // so that future users will be able to use it.
1374      if (!IsVoidRetTy)
1375        VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
1376                                                       Builder.getInt32(Width));
1377    }
1378  }
1379}
1380
1381Instruction *
1382InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
1383                                     Instruction *Loc) {
1384  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
1385  Legal->getRuntimePointerCheck();
1386
1387  if (!PtrRtCheck->Need)
1388    return NULL;
1389
1390  unsigned NumPointers = PtrRtCheck->Pointers.size();
1391  SmallVector<TrackingVH<Value>, 2> Starts;
1392  SmallVector<TrackingVH<Value>, 2> Ends;
1393
1394  SCEVExpander Exp(*SE, "induction");
1395
1396  // Use this type for pointer arithmetic.
1397  Type *PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
1398
1399  for (unsigned i = 0; i < NumPointers; ++i) {
1400    Value *Ptr = PtrRtCheck->Pointers[i];
1401    const SCEV *Sc = SE->getSCEV(Ptr);
1402
1403    if (SE->isLoopInvariant(Sc, OrigLoop)) {
1404      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
1405            *Ptr <<"\n");
1406      Starts.push_back(Ptr);
1407      Ends.push_back(Ptr);
1408    } else {
1409      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
1410
1411      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
1412      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
1413      Starts.push_back(Start);
1414      Ends.push_back(End);
1415    }
1416  }
1417
1418  IRBuilder<> ChkBuilder(Loc);
1419  // Our instructions might fold to a constant.
1420  Value *MemoryRuntimeCheck = 0;
1421  for (unsigned i = 0; i < NumPointers; ++i) {
1422    for (unsigned j = i+1; j < NumPointers; ++j) {
1423      // No need to check if two readonly pointers intersect.
1424      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
1425        continue;
1426
1427      // Only need to check pointers between two different dependency sets.
1428      if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
1429        continue;
1430
1431      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
1432      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
1433      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy, "bc");
1434      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy, "bc");
1435
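      // A pair of pointer ranges conflict iff Start0 <= End1 and
      // Start1 <= End0; build exactly that test.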
1436      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
1437      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
1438      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
1439      if (MemoryRuntimeCheck)
1440        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
1441                                         "conflict.rdx");
1442      MemoryRuntimeCheck = IsConflict;
1443    }
1444  }
1445
1446  // We have to do this trickery because the IRBuilder might fold the check to a
1447  // constant expression in which case there is no Instruction anchored in
1448  // the block.
1449  LLVMContext &Ctx = Loc->getContext();
1450  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
1451                                                  ConstantInt::getTrue(Ctx));
1452  ChkBuilder.Insert(Check, "memcheck.conflict");
1453  return Check;
1454}
1455
1456void
1457InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
1458  /*
1459   In this function we generate a new loop. The new loop will contain
1460   the vectorized instructions while the old loop will continue to run the
1461   scalar remainder.
1462
1463       [ ] <-- vector loop bypass (may consist of multiple blocks).
1464     /  |
1465    /   v
1466   |   [ ]     <-- vector pre header.
1467   |    |
1468   |    v
1469   |   [  ] \
1470   |   [  ]_|   <-- vector loop.
1471   |    |
1472    \   v
1473      >[ ]   <--- middle-block.
1474     /  |
1475    /   v
1476   |   [ ]     <--- new preheader.
1477   |    |
1478   |    v
1479   |   [ ] \
1480   |   [ ]_|   <-- old scalar loop to handle remainder.
1481    \   |
1482     \  v
1483      >[ ]     <-- exit block.
1484   ...
1485   */
1486
1487  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
1488  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
1489  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
1490  assert(ExitBlock && "Must have an exit block");
1491
1492  // Some loops have a single integer induction variable, while others
1493  // don't. One example is C++ iterator loops, which often have multiple
1494  // pointer induction variables. The code below also supports the case
1495  // where there is no single induction variable.
1496  OldInduction = Legal->getInduction();
1497  Type *IdxTy = Legal->getWidestInductionType();
1498
1499  // Find the loop boundaries.
1500  const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
1501  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
1502
1503  // Get the total trip count from the backedge-taken count by adding 1.
1504  ExitCount = SE->getAddExpr(ExitCount,
1505                             SE->getConstant(ExitCount->getType(), 1));
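  // (The backedge-taken count is the number of times the loop latch branches
  // back to the header, which is one less than the number of iterations.)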
1506
1507  // Expand the trip count and place the new instructions in the preheader.
1508  // Notice that the pre-header does not change, only the loop body.
1509  SCEVExpander Exp(*SE, "induction");
1510
1511  // Count holds the overall loop count (N).
1512  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
1513                                   BypassBlock->getTerminator());
1514
1515  // The loop index does not have to start at zero. Find the original start
1516  // value from the induction PHI node. If we don't have an induction variable
1517  // then we know that it starts at zero.
1518  Builder.SetInsertPoint(BypassBlock->getTerminator());
1519  Value *StartIdx = ExtendedIdx = OldInduction ?
1520    Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
1521                       IdxTy):
1522    ConstantInt::get(IdxTy, 0);
1523
1524  assert(BypassBlock && "Invalid loop structure");
1525  LoopBypassBlocks.push_back(BypassBlock);
1526
1527  // Split the single block loop into the two loop structure described above.
1528  BasicBlock *VectorPH =
1529  BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
1530  BasicBlock *VecBody =
1531  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
1532  BasicBlock *MiddleBlock =
1533  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
1534  BasicBlock *ScalarPH =
1535  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
1536
1537  // Create and register the new vector loop.
1538  Loop* Lp = new Loop();
1539  Loop *ParentLoop = OrigLoop->getParentLoop();
1540
1541  // Insert the new loop into the loop nest and register the new basic blocks
1542  // before calling any utilities such as SCEV that require valid LoopInfo.
1543  if (ParentLoop) {
1544    ParentLoop->addChildLoop(Lp);
1545    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
1546    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
1547    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
1548  } else {
1549    LI->addTopLevelLoop(Lp);
1550  }
1551  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
1552
1553  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
1554  // inside the loop.
1555  Builder.SetInsertPoint(VecBody->getFirstNonPHI());
1556
1557  // Generate the induction variable.
1558  setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
1559  Induction = Builder.CreatePHI(IdxTy, 2, "index");
1560  // The loop step is equal to the vectorization factor (num of SIMD elements)
1561  // times the unroll factor (num of SIMD instructions).
1562  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
1563
1564  // This is the IR builder that we use to add all of the logic for bypassing
1565  // the new vector loop.
1566  IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
1567  setDebugLocFromInst(BypassBuilder,
1568                      getDebugLocFromInstOrOperands(OldInduction));
1569
1570  // We may need to extend the index in case there is a type mismatch.
1571  // We know that the count starts at zero and does not overflow.
1572  if (Count->getType() != IdxTy) {
1573    // The exit count can be of pointer type. Convert it to the correct
1574    // integer type.
1575    if (ExitCount->getType()->isPointerTy())
1576      Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
1577    else
1578      Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
1579  }
1580
1581  // Add the start index to the loop count to get the new end index.
1582  Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
1583
1584  // Now we need to generate the expression for N - (N % Step), where Step is
1585  // VF * UF; this is the part that the vectorized body will execute.
1586  Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
1587  Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
1588  Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
1589                                                     "end.idx.rnd.down");
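  // For example, with N = 10, VF = 4 and UF = 2 (Step = 8): n.mod.vf = 2 and
  // n.vec = 8, so the vector body executes 8 iterations' worth of work and
  // the scalar remainder loop handles the last 2 iterations.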
1590
1591  // Now, compare the new count to zero. If it is zero, skip the vector loop
1592  // and jump to the scalar loop.
1593  Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
1594                                          "cmp.zero");
1595
1596  BasicBlock *LastBypassBlock = BypassBlock;
1597
1598  // Generate the code that checks at runtime whether the arrays overlap. We
1599  // put the checks into a separate block to keep the more common case of few
1600  // elements fast.
1601  Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
1602                                                 BypassBlock->getTerminator());
1603  if (MemRuntimeCheck) {
1604    // Create a new block containing the memory check.
1605    BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
1606                                                          "vector.memcheck");
1607    if (ParentLoop)
1608      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
1609    LoopBypassBlocks.push_back(CheckBlock);
1610
1611    // Replace the branch into the memory check block with a conditional branch
1612    // for the "few elements case".
1613    Instruction *OldTerm = BypassBlock->getTerminator();
1614    BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
1615    OldTerm->eraseFromParent();
1616
1617    Cmp = MemRuntimeCheck;
1618    LastBypassBlock = CheckBlock;
1619  }
1620
1621  LastBypassBlock->getTerminator()->eraseFromParent();
1622  BranchInst::Create(MiddleBlock, VectorPH, Cmp,
1623                     LastBypassBlock);
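  // If the check fires (zero vector trip count, or a memory conflict when a
  // runtime check was emitted) we branch to the middle block and skip the
  // vector loop; otherwise we fall through to the vector preheader.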
1624
1625  // We are going to resume the execution of the scalar loop.
1626  // Go over all of the induction variables that we found and fix the
1627  // PHIs that are left in the scalar version of the loop.
1628  // The starting values of PHI nodes depend on the counter of the last
1629  // iteration in the vectorized loop.
1630  // If we come from a bypass edge then we need to start from the original
1631  // start value.
1632
1633  // This variable saves the new starting index for the scalar loop.
1634  PHINode *ResumeIndex = 0;
1635  LoopVectorizationLegality::InductionList::iterator I, E;
1636  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
1637  // Set builder to point to last bypass block.
1638  BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
1639  for (I = List->begin(), E = List->end(); I != E; ++I) {
1640    PHINode *OrigPhi = I->first;
1641    LoopVectorizationLegality::InductionInfo II = I->second;
1642
1643    Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
1644    PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
1645                                         MiddleBlock->getTerminator());
1646    // We might have extended the type of the induction variable but we need a
1647    // truncated version for the scalar loop.
1648    PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
1649      PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
1650                      MiddleBlock->getTerminator()) : 0;
1651
1652    Value *EndValue = 0;
1653    switch (II.IK) {
1654    case LoopVectorizationLegality::IK_NoInduction:
1655      llvm_unreachable("Unknown induction");
1656    case LoopVectorizationLegality::IK_IntInduction: {
1657      // Handle the integer induction counter.
1658      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
1659
1660      // We have the canonical induction variable.
1661      if (OrigPhi == OldInduction) {
1662        // Create a truncated version of the resume value for the scalar loop;
1663        // we might have promoted the type to a larger width.
1664        EndValue =
1665          BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
1666        // The new PHI merges the original incoming value, in case of a bypass,
1667        // or the value at the end of the vectorized loop.
1668        for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
1669          TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
1670        TruncResumeVal->addIncoming(EndValue, VecBody);
1671
1672        // We know what the end value is.
1673        EndValue = IdxEndRoundDown;
1674        // We also know which PHI node holds it.
1675        ResumeIndex = ResumeVal;
1676        break;
1677      }
1678
1679      // Not the canonical induction variable - add the vector loop count to the
1680      // start value.
1681      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
1682                                                   II.StartValue->getType(),
1683                                                   "cast.crd");
1684      EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
1685      break;
1686    }
1687    case LoopVectorizationLegality::IK_ReverseIntInduction: {
1688      // Convert the CountRoundDown variable to the PHI size.
1689      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
1690                                                   II.StartValue->getType(),
1691                                                   "cast.crd");
1692      // Handle reverse integer induction counter.
1693      EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
1694      break;
1695    }
1696    case LoopVectorizationLegality::IK_PtrInduction: {
1697      // For pointer induction variables, calculate the offset using
1698      // the end index.
1699      EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
1700                                         "ptr.ind.end");
1701      break;
1702    }
1703    case LoopVectorizationLegality::IK_ReversePtrInduction: {
1704      // The value at the end of the loop for the reverse pointer is calculated
1705      // by creating a GEP with a negative index starting from the start value.
1706      Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
1707      Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
1708                                              "rev.ind.end");
1709      EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
1710                                         "rev.ptr.ind.end");
1711      break;
1712    }
1713    }// end of switch.
1714
1715    // The new PHI merges the original incoming value, in case of a bypass,
1716    // or the value at the end of the vectorized loop.
1717    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
1718      if (OrigPhi == OldInduction)
1719        ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
1720      else
1721        ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
1722    }
1723    ResumeVal->addIncoming(EndValue, VecBody);
1724
1725    // Fix the scalar body counter (PHI node).
1726    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
1727    // The old induction's phi node in the scalar body needs the truncated value.
1728    if (OrigPhi == OldInduction)
1729      OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
1730    else
1731      OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
1732  }
1733
1734  // If we are generating a new induction variable then we also need to
1735  // generate the code that calculates the exit value. This value is not
1736  // simply the end of the counter because we may skip the vectorized body
1737  // in case of a runtime check.
1738  if (!OldInduction) {
1739    assert(!ResumeIndex && "Unexpected resume value found");
1740    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
1741                                  MiddleBlock->getTerminator());
1742    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
1743      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
1744    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
1745  }
1746
1747  // Make sure that we found the index where the scalar loop needs to continue.
1748  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
1749         "Invalid resume Index");
1750
1751  // Add a check in the middle block to see if we have completed
1752  // all of the iterations in the first vector loop.
1753  // If (N - N % (VF * UF)) == N, then we *don't* need to run the remainder.
1754  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
1755                                ResumeIndex, "cmp.n",
1756                                MiddleBlock->getTerminator());
1757
1758  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
1759  // Remove the old terminator.
1760  MiddleBlock->getTerminator()->eraseFromParent();
1761
1762  // Create i+1 and fill the PHINode.
1763  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
1764  Induction->addIncoming(StartIdx, VectorPH);
1765  Induction->addIncoming(NextIdx, VecBody);
1766  // Create the compare.
1767  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
1768  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
1769
1770  // Now we have two terminators. Remove the old one from the block.
1771  VecBody->getTerminator()->eraseFromParent();
1772
1773  // Get ready to start creating new instructions into the vectorized body.
1774  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
1775
1776  // Save the state.
1777  LoopVectorPreHeader = VectorPH;
1778  LoopScalarPreHeader = ScalarPH;
1779  LoopMiddleBlock = MiddleBlock;
1780  LoopExitBlock = ExitBlock;
1781  LoopVectorBody = VecBody;
1782  LoopScalarBody = OldBasicBlock;
1783}
1784
1785/// This function returns the identity element (or neutral element) for
1786/// the operation K.
1787Constant*
1788LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
1789  switch (K) {
1790  case RK_IntegerXor:
1791  case RK_IntegerAdd:
1792  case RK_IntegerOr:
1793    // Adding, XOR-ing, or OR-ing zero to a number does not change it.
1794    return ConstantInt::get(Tp, 0);
1795  case RK_IntegerMult:
1796    // Multiplying a number by 1 does not change it.
1797    return ConstantInt::get(Tp, 1);
1798  case RK_IntegerAnd:
1799    // AND-ing a number with an all-1 value does not change it.
1800    return ConstantInt::get(Tp, -1, true);
1801  case RK_FloatMult:
1802    // Multiplying a number by 1 does not change it.
1803    return ConstantFP::get(Tp, 1.0L);
1804  case RK_FloatAdd:
1805    // Adding zero to a number does not change it.
1806    return ConstantFP::get(Tp, 0.0L);
1807  default:
1808    llvm_unreachable("Unknown reduction kind");
1809  }
1810}
1811
1812static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I,
1813                                              Intrinsic::ID ValidIntrinsicID) {
1814  if (I.getNumArgOperands() != 1 ||
1815      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
1816      I.getType() != I.getArgOperand(0)->getType() ||
1817      !I.onlyReadsMemory())
1818    return Intrinsic::not_intrinsic;
1819
1820  return ValidIntrinsicID;
1821}
1822
1823static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I,
1824                                               Intrinsic::ID ValidIntrinsicID) {
1825  if (I.getNumArgOperands() != 2 ||
1826      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
1827      !I.getArgOperand(1)->getType()->isFloatingPointTy() ||
1828      I.getType() != I.getArgOperand(0)->getType() ||
1829      I.getType() != I.getArgOperand(1)->getType() ||
1830      !I.onlyReadsMemory())
1831    return Intrinsic::not_intrinsic;
1832
1833  return ValidIntrinsicID;
1834}
1835
1836
1837static Intrinsic::ID
1838getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
1839  // If we have an intrinsic call, check if it is trivially vectorizable.
1840  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
1841    switch (II->getIntrinsicID()) {
1842    case Intrinsic::sqrt:
1843    case Intrinsic::sin:
1844    case Intrinsic::cos:
1845    case Intrinsic::exp:
1846    case Intrinsic::exp2:
1847    case Intrinsic::log:
1848    case Intrinsic::log10:
1849    case Intrinsic::log2:
1850    case Intrinsic::fabs:
1851    case Intrinsic::copysign:
1852    case Intrinsic::floor:
1853    case Intrinsic::ceil:
1854    case Intrinsic::trunc:
1855    case Intrinsic::rint:
1856    case Intrinsic::nearbyint:
1857    case Intrinsic::round:
1858    case Intrinsic::pow:
1859    case Intrinsic::fma:
1860    case Intrinsic::fmuladd:
1861    case Intrinsic::lifetime_start:
1862    case Intrinsic::lifetime_end:
1863      return II->getIntrinsicID();
1864    default:
1865      return Intrinsic::not_intrinsic;
1866    }
1867  }
1868
1869  if (!TLI)
1870    return Intrinsic::not_intrinsic;
1871
1872  LibFunc::Func Func;
1873  Function *F = CI->getCalledFunction();
1874  // We're going to make assumptions on the semantics of the functions, check
1875  // that the target knows that it's available in this environment and it does
1876  // not have local linkage.
1877  if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func))
1878    return Intrinsic::not_intrinsic;
1879
1880  // Otherwise check if we have a call to a function that can be turned into a
1881  // vector intrinsic.
1882  switch (Func) {
1883  default:
1884    break;
1885  case LibFunc::sin:
1886  case LibFunc::sinf:
1887  case LibFunc::sinl:
1888    return checkUnaryFloatSignature(*CI, Intrinsic::sin);
1889  case LibFunc::cos:
1890  case LibFunc::cosf:
1891  case LibFunc::cosl:
1892    return checkUnaryFloatSignature(*CI, Intrinsic::cos);
1893  case LibFunc::exp:
1894  case LibFunc::expf:
1895  case LibFunc::expl:
1896    return checkUnaryFloatSignature(*CI, Intrinsic::exp);
1897  case LibFunc::exp2:
1898  case LibFunc::exp2f:
1899  case LibFunc::exp2l:
1900    return checkUnaryFloatSignature(*CI, Intrinsic::exp2);
1901  case LibFunc::log:
1902  case LibFunc::logf:
1903  case LibFunc::logl:
1904    return checkUnaryFloatSignature(*CI, Intrinsic::log);
1905  case LibFunc::log10:
1906  case LibFunc::log10f:
1907  case LibFunc::log10l:
1908    return checkUnaryFloatSignature(*CI, Intrinsic::log10);
1909  case LibFunc::log2:
1910  case LibFunc::log2f:
1911  case LibFunc::log2l:
1912    return checkUnaryFloatSignature(*CI, Intrinsic::log2);
1913  case LibFunc::fabs:
1914  case LibFunc::fabsf:
1915  case LibFunc::fabsl:
1916    return checkUnaryFloatSignature(*CI, Intrinsic::fabs);
1917  case LibFunc::copysign:
1918  case LibFunc::copysignf:
1919  case LibFunc::copysignl:
1920    return checkBinaryFloatSignature(*CI, Intrinsic::copysign);
1921  case LibFunc::floor:
1922  case LibFunc::floorf:
1923  case LibFunc::floorl:
1924    return checkUnaryFloatSignature(*CI, Intrinsic::floor);
1925  case LibFunc::ceil:
1926  case LibFunc::ceilf:
1927  case LibFunc::ceill:
1928    return checkUnaryFloatSignature(*CI, Intrinsic::ceil);
1929  case LibFunc::trunc:
1930  case LibFunc::truncf:
1931  case LibFunc::truncl:
1932    return checkUnaryFloatSignature(*CI, Intrinsic::trunc);
1933  case LibFunc::rint:
1934  case LibFunc::rintf:
1935  case LibFunc::rintl:
1936    return checkUnaryFloatSignature(*CI, Intrinsic::rint);
1937  case LibFunc::nearbyint:
1938  case LibFunc::nearbyintf:
1939  case LibFunc::nearbyintl:
1940    return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint);
1941  case LibFunc::round:
1942  case LibFunc::roundf:
1943  case LibFunc::roundl:
1944    return checkUnaryFloatSignature(*CI, Intrinsic::round);
1945  case LibFunc::pow:
1946  case LibFunc::powf:
1947  case LibFunc::powl:
1948    return checkBinaryFloatSignature(*CI, Intrinsic::pow);
1949  }
1950
1951  return Intrinsic::not_intrinsic;
1952}
1953
1954/// This function translates the reduction kind to an LLVM binary operator.
1955static unsigned
1956getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
1957  switch (Kind) {
1958    case LoopVectorizationLegality::RK_IntegerAdd:
1959      return Instruction::Add;
1960    case LoopVectorizationLegality::RK_IntegerMult:
1961      return Instruction::Mul;
1962    case LoopVectorizationLegality::RK_IntegerOr:
1963      return Instruction::Or;
1964    case LoopVectorizationLegality::RK_IntegerAnd:
1965      return Instruction::And;
1966    case LoopVectorizationLegality::RK_IntegerXor:
1967      return Instruction::Xor;
1968    case LoopVectorizationLegality::RK_FloatMult:
1969      return Instruction::FMul;
1970    case LoopVectorizationLegality::RK_FloatAdd:
1971      return Instruction::FAdd;
1972    case LoopVectorizationLegality::RK_IntegerMinMax:
1973      return Instruction::ICmp;
1974    case LoopVectorizationLegality::RK_FloatMinMax:
1975      return Instruction::FCmp;
1976    default:
1977      llvm_unreachable("Unknown reduction operation");
1978  }
1979}
1980
1981static Value *createMinMaxOp(IRBuilder<> &Builder,
1982                             LoopVectorizationLegality::MinMaxReductionKind RK,
1983                             Value *Left,
1984                             Value *Right) {
1985  CmpInst::Predicate P = CmpInst::ICMP_NE;
1986  switch (RK) {
1987  default:
1988    llvm_unreachable("Unknown min/max reduction kind");
1989  case LoopVectorizationLegality::MRK_UIntMin:
1990    P = CmpInst::ICMP_ULT;
1991    break;
1992  case LoopVectorizationLegality::MRK_UIntMax:
1993    P = CmpInst::ICMP_UGT;
1994    break;
1995  case LoopVectorizationLegality::MRK_SIntMin:
1996    P = CmpInst::ICMP_SLT;
1997    break;
1998  case LoopVectorizationLegality::MRK_SIntMax:
1999    P = CmpInst::ICMP_SGT;
2000    break;
2001  case LoopVectorizationLegality::MRK_FloatMin:
2002    P = CmpInst::FCMP_OLT;
2003    break;
2004  case LoopVectorizationLegality::MRK_FloatMax:
2005    P = CmpInst::FCMP_OGT;
2006    break;
2007  }
2008
2009  Value *Cmp;
2010  if (RK == LoopVectorizationLegality::MRK_FloatMin ||
2011      RK == LoopVectorizationLegality::MRK_FloatMax)
2012    Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
2013  else
2014    Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
2015
2016  Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
2017  return Select;
2018}
2019
2020void
2021InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
2022  //===------------------------------------------------===//
2023  //
2024  // Notice: any optimization or new instruction that goes
2025  // into the code below should also be implemented in
2026  // the cost-model.
2027  //
2028  //===------------------------------------------------===//
2029  Constant *Zero = Builder.getInt32(0);
2030
2031  // In order to support reduction variables we need to be able to vectorize
2032  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
2033  // stages. First, we create a new vector PHI node with no incoming edges.
2034  // We use this value when we vectorize all of the instructions that use the
2035  // PHI. Next, after all of the instructions in the block are complete we
2036  // add the new incoming edges to the PHI. At this point all of the
2037  // instructions in the basic block are vectorized, so we can use them to
2038  // construct the PHI.
2039  PhiVector RdxPHIsToFix;
2040
2041  // Scan the loop in a topological order to ensure that defs are vectorized
2042  // before users.
2043  LoopBlocksDFS DFS(OrigLoop);
2044  DFS.perform(LI);
2045
2046  // Vectorize all of the blocks in the original loop.
2047  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
2048       be = DFS.endRPO(); bb != be; ++bb)
2049    vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
2050
2051  // At this point every instruction in the original loop is widened to
2052  // a vector form. We are almost done. Now, we need to fix the PHI nodes
2053  // that we vectorized. The PHI nodes are currently empty because we did
2054  // not want to introduce cycles. Notice that the remaining PHI nodes
2055  // that we need to fix are reduction variables.
2056
2057  // Create the 'reduced' values for each of the reduction vars.
2058  // The reduced values are the vector values that we scalarize and combine
2059  // after the loop is finished.
2060  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
2061       it != e; ++it) {
2062    PHINode *RdxPhi = *it;
2063    assert(RdxPhi && "Unable to recover vectorized PHI");
2064
2065    // Find the reduction variable descriptor.
2066    assert(Legal->getReductionVars()->count(RdxPhi) &&
2067           "Unable to find the reduction variable");
2068    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
2069    (*Legal->getReductionVars())[RdxPhi];
2070
2071    setDebugLocFromInst(Builder, RdxDesc.StartValue);
2072
2073    // We need to generate a reduction vector from the incoming scalar.
2074    // To do so, we need to generate the 'identity' vector and override
2075    // one of the elements with the incoming scalar reduction. We need
2076    // to do it in the vector-loop preheader.
2077    Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
2078
2079    // This is the vector-clone of the value that leaves the loop.
2080    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
2081    Type *VecTy = VectorExit[0]->getType();
2082
2083    // Find the reduction identity variable. Zero for addition, or, xor,
2084    // one for multiplication, -1 for And.
2085    Value *Identity;
2086    Value *VectorStart;
2087    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
2088        RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
2089      // MinMax reductions have the start value as their identity.
2090      if (VF == 1) {
2091        VectorStart = Identity = RdxDesc.StartValue;
2092      } else {
2093        VectorStart = Identity = Builder.CreateVectorSplat(VF,
2094                                                           RdxDesc.StartValue,
2095                                                           "minmax.ident");
2096      }
2097    } else {
2098      // Handle other reduction kinds:
2099      Constant *Iden =
2100      LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
2101                                                      VecTy->getScalarType());
2102      if (VF == 1) {
2103        Identity = Iden;
2104        // This vector is the Identity vector where the first element is the
2105        // incoming scalar reduction.
2106        VectorStart = RdxDesc.StartValue;
2107      } else {
2108        Identity = ConstantVector::getSplat(VF, Iden);
2109
2110        // This vector is the Identity vector where the first element is the
2111        // incoming scalar reduction.
2112        VectorStart = Builder.CreateInsertElement(Identity,
2113                                                  RdxDesc.StartValue, Zero);
2114      }
2115    }
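    // For example, an integer add reduction with start value %s and VF = 4
    // uses Identity = <0, 0, 0, 0> and VectorStart = <%s, 0, 0, 0>, so the
    // start value is accumulated exactly once.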
2116
2117    // Fix the vector-loop phi.
2118    // We created the induction variable so we know that the
2119    // preheader is the first entry.
2120    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
2121
2122    // Reductions do not have to start at zero. They can start with
2123    // any loop invariant values.
2124    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
2125    BasicBlock *Latch = OrigLoop->getLoopLatch();
2126    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
2127    VectorParts &Val = getVectorValue(LoopVal);
2128    for (unsigned part = 0; part < UF; ++part) {
2129      // Make sure to add the reduction start value only to the
2130      // first unroll part.
2131      Value *StartVal = (part == 0) ? VectorStart : Identity;
2132      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
2133      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
2134    }
2135
2136    // Before each round, move the insertion point right between
2137    // the PHIs and the values we are going to write.
2138    // This allows us to write both PHINodes and the extractelement
2139    // instructions.
2140    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
2141
2142    VectorParts RdxParts;
2143    setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
2144    for (unsigned part = 0; part < UF; ++part) {
2145      // This PHINode contains the vectorized reduction variable, or
2146      // the initial value vector, if we bypass the vector loop.
2147      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
2148      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
2149      Value *StartVal = (part == 0) ? VectorStart : Identity;
2150      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
2151        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
2152      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
2153      RdxParts.push_back(NewPhi);
2154    }
2155
2156    // Reduce all of the unrolled parts into a single vector.
2157    Value *ReducedPartRdx = RdxParts[0];
2158    unsigned Op = getReductionBinOp(RdxDesc.Kind);
2159    setDebugLocFromInst(Builder, ReducedPartRdx);
2160    for (unsigned part = 1; part < UF; ++part) {
2161      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
2162        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
2163                                             RdxParts[part], ReducedPartRdx,
2164                                             "bin.rdx");
2165      else
2166        ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
2167                                        ReducedPartRdx, RdxParts[part]);
2168    }
2169
2170    if (VF > 1) {
2171      // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
2172      // and vector ops, reducing the set of values being computed by half each
2173      // round.
2174      assert(isPowerOf2_32(VF) &&
2175             "Reduction emission only supported for pow2 vectors!");
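      // For example, for VF = 4 and an add reduction of <a, b, c, d>:
      //   round 1: shuffle to <c, d, undef, undef>; add -> <a+c, b+d, ..>
      //   round 2: shuffle to <b+d, undef, ..>;     add -> <a+b+c+d, ..>
      // and the final result is read from lane 0.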
2176      Value *TmpVec = ReducedPartRdx;
2177      SmallVector<Constant*, 32> ShuffleMask(VF, 0);
2178      for (unsigned i = VF; i != 1; i >>= 1) {
2179        // Move the upper half of the vector to the lower half.
2180        for (unsigned j = 0; j != i/2; ++j)
2181          ShuffleMask[j] = Builder.getInt32(i/2 + j);
2182
2183        // Fill the rest of the mask with undef.
2184        std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
2185                  UndefValue::get(Builder.getInt32Ty()));
2186
2187        Value *Shuf =
2188        Builder.CreateShuffleVector(TmpVec,
2189                                    UndefValue::get(TmpVec->getType()),
2190                                    ConstantVector::get(ShuffleMask),
2191                                    "rdx.shuf");
2192
2193        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
2194          TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
2195                                       "bin.rdx");
2196        else
2197          TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
2198      }
2199
2200      // The result is in the first element of the vector.
2201      ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
2202                                                    Builder.getInt32(0));
2203    }
2204
2205    // Now, we need to fix the users of the reduction variable
2206    // inside and outside of the scalar remainder loop.
2207    // We know that the loop is in LCSSA form. We need to update the
2208    // PHI nodes in the exit blocks.
2209    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
2210         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
2211      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
2212      if (!LCSSAPhi) break;
2213
2214      // All PHINodes need to have a single incoming edge, or two if
2215      // we already fixed them.
2216      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
2217
2218      // We found our reduction value exit-PHI. Update it with the
2219      // incoming bypass edge.
2220      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
2221        // Add an edge coming from the bypass.
2222        LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
2223        break;
2224      }
2225    }// end of the LCSSA phi scan.
2226
2227    // Fix the scalar loop reduction variable with the incoming reduction sum
2228    // from the vector body and from the backedge value.
2229    int IncomingEdgeBlockIdx =
2230    (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
2231    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
2232    // Pick the other block.
2233    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
2234    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
2235    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
2236  }// end of for each redux variable.
2237
2238  fixLCSSAPHIs();
2239}
2240
2241void InnerLoopVectorizer::fixLCSSAPHIs() {
2242  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
2243       LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
2244    PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
2245    if (!LCSSAPhi) break;
2246    if (LCSSAPhi->getNumIncomingValues() == 1)
2247      LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
2248                            LoopMiddleBlock);
2249  }
2250}
2251
2252InnerLoopVectorizer::VectorParts
2253InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
2254  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
2255         "Invalid edge");
2256
2257  // Look for cached value.
2258  std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
2259  EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
2260  if (ECEntryIt != MaskCache.end())
2261    return ECEntryIt->second;
2262
2263  VectorParts SrcMask = createBlockInMask(Src);
2264
2265  // The terminator has to be a branch inst!
2266  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
2267  assert(BI && "Unexpected terminator found");
2268
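  // The mask of an edge (Src, Dst) is the mask of Src AND'ed with the branch
  // condition, negated when Dst is the false successor.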
2269  if (BI->isConditional()) {
2270    VectorParts EdgeMask = getVectorValue(BI->getCondition());
2271
2272    if (BI->getSuccessor(0) != Dst)
2273      for (unsigned part = 0; part < UF; ++part)
2274        EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
2275
2276    for (unsigned part = 0; part < UF; ++part)
2277      EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
2278
2279    MaskCache[Edge] = EdgeMask;
2280    return EdgeMask;
2281  }
2282
2283  MaskCache[Edge] = SrcMask;
2284  return SrcMask;
2285}
2286
2287InnerLoopVectorizer::VectorParts
2288InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
2289  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
2290
2291  // The loop header's incoming mask is all-ones.
2292  if (OrigLoop->getHeader() == BB) {
2293    Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
2294    return getVectorValue(C);
2295  }
2296
2297  // This is the block mask: the OR of all incoming edge masks, seeded with zero.
2298  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
2299  VectorParts BlockMask = getVectorValue(Zero);
2300
2301  // For each pred:
2302  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
2303    VectorParts EM = createEdgeMask(*it, BB);
2304    for (unsigned part = 0; part < UF; ++part)
2305      BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
2306  }
2307
2308  return BlockMask;
2309}
2310
2311void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
2312                                              InnerLoopVectorizer::VectorParts &Entry,
2313                                              LoopVectorizationLegality *Legal,
2314                                              unsigned UF, unsigned VF, PhiVector *PV) {
2315  PHINode* P = cast<PHINode>(PN);
2316  // Handle reduction variables:
2317  if (Legal->getReductionVars()->count(P)) {
2318    for (unsigned part = 0; part < UF; ++part) {
2319      // This is phase one of vectorizing PHIs.
2320      Type *VecTy = (VF == 1) ? PN->getType()
2321                              : VectorType::get(PN->getType(), VF);
2322      Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
2323                                    LoopVectorBody->getFirstInsertionPt());
2324    }
2325    PV->push_back(P);
2326    return;
2327  }
2328
2329  setDebugLocFromInst(Builder, P);
2330  // Check for PHI nodes that are lowered to vector selects.
2331  if (P->getParent() != OrigLoop->getHeader()) {
2332    // We know that all PHIs in non header blocks are converted into
2333    // selects, so we don't have to worry about the insertion order and we
2334    // can just use the builder.
2335    // At this point we generate the predication tree. There may be
2336    // duplications since this is a simple recursive scan, but future
2337    // optimizations will clean it up.
2338
2339    unsigned NumIncoming = P->getNumIncomingValues();
2340
2341    // Generate a sequence of selects of the form:
2342    // SELECT(Mask3, In3,
2343    //      SELECT(Mask2, In2,
2344    //                   (...)))
2345    for (unsigned In = 0; In < NumIncoming; In++) {
2346      VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
2347                                        P->getParent());
2348      VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
2349
2350      for (unsigned part = 0; part < UF; ++part) {
2351        // We might have single edge PHIs (blocks) - use an identity
2352        // 'select' for the first PHI operand.
2353        if (In == 0)
2354          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
2355                                             In0[part]);
2356        else
2357          // Select between the current value and the previous incoming edge
2358          // based on the incoming mask.
2359          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
2360                                             Entry[part], "predphi");
2361      }
2362    }
2363    return;
2364  }
2365
2366  // This PHINode must be an induction variable.
2367  // Make sure that we know about it.
2368  assert(Legal->getInductionVars()->count(P) &&
2369         "Not an induction variable");
2370
2371  LoopVectorizationLegality::InductionInfo II =
2372  Legal->getInductionVars()->lookup(P);
2373
2374  switch (II.IK) {
2375    case LoopVectorizationLegality::IK_NoInduction:
2376      llvm_unreachable("Unknown induction");
2377    case LoopVectorizationLegality::IK_IntInduction: {
2378      assert(P->getType() == II.StartValue->getType() && "Types must match");
2379      Type *PhiTy = P->getType();
2380      Value *Broadcasted;
2381      if (P == OldInduction) {
2382        // Handle the canonical induction variable. We might have had to
2383        // extend the type.
2384        Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
2385      } else {
2386        // Handle other induction variables that are now based on the
2387        // canonical one.
2388        Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
2389                                                 "normalized.idx");
2390        NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
2391        Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
2392                                        "offset.idx");
2393      }
2394      Broadcasted = getBroadcastInstrs(Broadcasted);
2395      // After broadcasting the induction variable we need to make the vector
2396      // consecutive by adding 0, 1, 2, etc.
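      // For example, with VF = 4 and UF = 2, part 0 holds
      // <i, i+1, i+2, i+3> and part 1 holds <i+4, i+5, i+6, i+7>.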
2397      for (unsigned part = 0; part < UF; ++part)
2398        Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
2399      return;
2400    }
2401    case LoopVectorizationLegality::IK_ReverseIntInduction:
2402    case LoopVectorizationLegality::IK_PtrInduction:
2403    case LoopVectorizationLegality::IK_ReversePtrInduction:
2404      // Handle reverse integer and pointer inductions.
2405      Value *StartIdx = ExtendedIdx;
2406      // This is the normalized GEP that starts counting at zero.
2407      Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
2408                                               "normalized.idx");
2409
2410      // Handle the reverse integer induction variable case.
2411      if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
2412        IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
2413        Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
2414                                               "resize.norm.idx");
2415        Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
2416                                               "reverse.idx");
2417
2418        // This is a new value so do not hoist it out.
2419        Value *Broadcasted = getBroadcastInstrs(ReverseInd);
2420        // After broadcasting the induction variable we need to make the
2421        // vector consecutive by adding ... -3, -2, -1, 0.
2422        for (unsigned part = 0; part < UF; ++part)
2423          Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
2424                                             true);
2425        return;
2426      }
2427
2428      // Handle the pointer induction variable case.
2429      assert(P->getType()->isPointerTy() && "Unexpected type.");
2430
2431      // Is this a reverse induction ptr or a consecutive induction ptr?
2432      bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
2433                      II.IK);
2434
2435      // This is the vector of results. Notice that we don't generate
2436      // vector geps because scalar geps result in better code.
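      // For example, with VF = 4, part 0 emits scalar GEPs for offsets
      // 0..3 (or 0..-3 when reversed) and packs them with insertelement.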
2437      for (unsigned part = 0; part < UF; ++part) {
2438        if (VF == 1) {
2439          int EltIndex = (part) * (Reverse ? -1 : 1);
2440          Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
2441          Value *GlobalIdx;
2442          if (Reverse)
2443            GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
2444          else
2445            GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
2446
2447          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
2448                                             "next.gep");
2449          Entry[part] = SclrGep;
2450          continue;
2451        }
2452
2453        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
2454        for (unsigned int i = 0; i < VF; ++i) {
2455          int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
2456          Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
2457          Value *GlobalIdx;
2458          if (!Reverse)
2459            GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
2460          else
2461            GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
2462
2463          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
2464                                             "next.gep");
2465          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
2466                                               Builder.getInt32(i),
2467                                               "insert.gep");
2468        }
2469        Entry[part] = VecVal;
2470      }
2471      return;
2472  }
2473}
2474
2475void
2476InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
2477                                          BasicBlock *BB, PhiVector *PV) {
2478  // For each instruction in the old loop.
2479  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
2480    VectorParts &Entry = WidenMap.get(it);
2481    switch (it->getOpcode()) {
2482    case Instruction::Br:
2483      // Nothing to do for BR, since we already took care of the
2484      // loop control flow instructions.
2485      continue;
2486    case Instruction::PHI:{
2487      // Vectorize PHINodes.
2488      widenPHIInstruction(it, Entry, Legal, UF, VF, PV);
2489      continue;
2490    }// End of PHI.
2491
2492    case Instruction::Add:
2493    case Instruction::FAdd:
2494    case Instruction::Sub:
2495    case Instruction::FSub:
2496    case Instruction::Mul:
2497    case Instruction::FMul:
2498    case Instruction::UDiv:
2499    case Instruction::SDiv:
2500    case Instruction::FDiv:
2501    case Instruction::URem:
2502    case Instruction::SRem:
2503    case Instruction::FRem:
2504    case Instruction::Shl:
2505    case Instruction::LShr:
2506    case Instruction::AShr:
2507    case Instruction::And:
2508    case Instruction::Or:
2509    case Instruction::Xor: {
2510      // Just widen binops.
2511      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
2512      setDebugLocFromInst(Builder, BinOp);
2513      VectorParts &A = getVectorValue(it->getOperand(0));
2514      VectorParts &B = getVectorValue(it->getOperand(1));
2515
2516      // Use this vector value for all users of the original instruction.
2517      for (unsigned Part = 0; Part < UF; ++Part) {
2518        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
2519
2520        // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
2521        BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
2522        if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
2523          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
2524          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
2525        }
2526        if (VecOp && isa<PossiblyExactOperator>(VecOp))
2527          VecOp->setIsExact(BinOp->isExact());
2528
2529        Entry[Part] = V;
2530      }
2531      break;
2532    }
2533    case Instruction::Select: {
2534      // Widen selects.
2535      // If the selector is loop invariant we can create a select
2536      // instruction with a scalar condition. Otherwise, use vector-select.
2537      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
2538                                               OrigLoop);
2539      setDebugLocFromInst(Builder, it);
2540
2541      // The condition can be loop invariant but still defined inside the
2542      // loop. This means that we can't just use the original 'cond' value.
2543      // We have to take the 'vectorized' value and pick the first lane.
2544      // Instcombine will make this a no-op.
2545      VectorParts &Cond = getVectorValue(it->getOperand(0));
2546      VectorParts &Op0  = getVectorValue(it->getOperand(1));
2547      VectorParts &Op1  = getVectorValue(it->getOperand(2));
2548
2549      Value *ScalarCond = (VF == 1) ? Cond[0] :
2550        Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
2551
2552      for (unsigned Part = 0; Part < UF; ++Part) {
2553        Entry[Part] = Builder.CreateSelect(
2554          InvariantCond ? ScalarCond : Cond[Part],
2555          Op0[Part],
2556          Op1[Part]);
2557      }
2558      break;
2559    }
2560
2561    case Instruction::ICmp:
2562    case Instruction::FCmp: {
2563      // Widen compares. Generate vector compares.
2564      bool FCmp = (it->getOpcode() == Instruction::FCmp);
2565      CmpInst *Cmp = dyn_cast<CmpInst>(it);
2566      setDebugLocFromInst(Builder, it);
2567      VectorParts &A = getVectorValue(it->getOperand(0));
2568      VectorParts &B = getVectorValue(it->getOperand(1));
2569      for (unsigned Part = 0; Part < UF; ++Part) {
2570        Value *C = 0;
2571        if (FCmp)
2572          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
2573        else
2574          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
2575        Entry[Part] = C;
2576      }
2577      break;
2578    }
2579
2580    case Instruction::Store:
2581    case Instruction::Load:
2582      vectorizeMemoryInstruction(it, Legal);
2583      break;
2584    case Instruction::ZExt:
2585    case Instruction::SExt:
2586    case Instruction::FPToUI:
2587    case Instruction::FPToSI:
2588    case Instruction::FPExt:
2589    case Instruction::PtrToInt:
2590    case Instruction::IntToPtr:
2591    case Instruction::SIToFP:
2592    case Instruction::UIToFP:
2593    case Instruction::Trunc:
2594    case Instruction::FPTrunc:
2595    case Instruction::BitCast: {
2596      CastInst *CI = dyn_cast<CastInst>(it);
2597      setDebugLocFromInst(Builder, it);
2598      // Optimize the special case where the source is the induction
2599      // variable. Notice that we can only optimize the 'trunc' case
2600      // because: a. FP conversions lose precision, b. sext/zext may wrap,
2601      // c. other casts depend on pointer size.
2602      if (CI->getOperand(0) == OldInduction &&
2603          it->getOpcode() == Instruction::Trunc) {
2604        Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
2605                                               CI->getType());
2606        Value *Broadcasted = getBroadcastInstrs(ScalarCast);
2607        for (unsigned Part = 0; Part < UF; ++Part)
2608          Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
2609        break;
2610      }
2611      // Vectorize casts.
2612      Type *DestTy = (VF == 1) ? CI->getType() :
2613                                 VectorType::get(CI->getType(), VF);
2614
2615      VectorParts &A = getVectorValue(it->getOperand(0));
2616      for (unsigned Part = 0; Part < UF; ++Part)
2617        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
2618      break;
2619    }
2620
2621    case Instruction::Call: {
2622      // Ignore dbg intrinsics.
2623      if (isa<DbgInfoIntrinsic>(it))
2624        break;
2625      setDebugLocFromInst(Builder, it);
2626
2627      Module *M = BB->getParent()->getParent();
2628      CallInst *CI = cast<CallInst>(it);
2629      Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2630      assert(ID && "Not an intrinsic call!");
2631      switch (ID) {
2632      case Intrinsic::lifetime_end:
2633      case Intrinsic::lifetime_start:
2634        scalarizeInstruction(it);
2635        break;
2636      default:
2637        for (unsigned Part = 0; Part < UF; ++Part) {
2638          SmallVector<Value *, 4> Args;
2639          for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
2640            VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
2641            Args.push_back(Arg[Part]);
2642          }
2643          Type *Tys[] = {CI->getType()};
2644          if (VF > 1)
2645            Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
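          // For example, a call to llvm.sqrt.f32 is widened to a call to
          // llvm.sqrt.v4f32 when VF = 4.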
2646
2647          Function *F = Intrinsic::getDeclaration(M, ID, Tys);
2648          Entry[Part] = Builder.CreateCall(F, Args);
2649        }
2650        break;
2651      }
2652      break;
2653    }
2654
2655    default:
2656      // All other instructions are unsupported. Scalarize them.
2657      scalarizeInstruction(it);
2658      break;
2659    }// end of switch.
2660  }// end of for_each instr.
2661}
2662
2663void InnerLoopVectorizer::updateAnalysis() {
2664  // Make ScalarEvolution forget everything it knows about the original loop.
2665  SE->forgetLoop(OrigLoop);
2666
2667  // Update the dominator tree information.
2668  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
2669         "Entry does not dominate exit.");
2670
2671  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
2672    DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
2673  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
2674  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
2675  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
2676  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
2677  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
2678  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2679
2680  DEBUG(DT->verifyAnalysis());
2681}
2682
2683bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
2684  if (!EnableIfConversion)
2685    return false;
2686
2687  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
2688  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
2689
2690  // A list of pointers that we can safely read and write to.
2691  SmallPtrSet<Value *, 8> SafePointers;
2692
2693  // Collect safe addresses.
2694  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
2695    BasicBlock *BB = LoopBlocks[i];
2696
2697    if (blockNeedsPredication(BB))
2698      continue;
2699
2700    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
2701      if (LoadInst *LI = dyn_cast<LoadInst>(I))
2702        SafePointers.insert(LI->getPointerOperand());
2703      else if (StoreInst *SI = dyn_cast<StoreInst>(I))
2704        SafePointers.insert(SI->getPointerOperand());
2705    }
2706  }
2707
2708  // Collect the blocks that need predication.
2709  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
2710    BasicBlock *BB = LoopBlocks[i];
2711
2712    // We don't support switch statements inside loops.
2713    if (!isa<BranchInst>(BB->getTerminator()))
2714      return false;
2715
2716    // We must be able to predicate all blocks that need to be predicated.
2717    if (blockNeedsPredication(BB) && !blockCanBePredicated(BB, SafePointers))
2718      return false;
2719  }
2720
2721  // We can if-convert this loop.
2722  return true;
2723}
2724
2725bool LoopVectorizationLegality::canVectorize() {
2726  // We must have a loop in canonical form. Loops with indirectbr in them cannot
2727  // be canonicalized.
2728  if (!TheLoop->getLoopPreheader())
2729    return false;
2730
2731  // We can only vectorize innermost loops.
2732  if (!TheLoop->getSubLoops().empty())
2733    return false;
2734
2735  // We must have a single backedge.
2736  if (TheLoop->getNumBackEdges() != 1)
2737    return false;
2738
2739  // We must have a single exiting block.
2740  if (!TheLoop->getExitingBlock())
2741    return false;
2742
2743  unsigned NumBlocks = TheLoop->getNumBlocks();
2744
2745  // Check if we can if-convert non-single-bb loops.
2746  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
2747    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
2748    return false;
2749  }
2750
2751  // Get the loop latch; we use it below to compute the trip count.
2752  BasicBlock *Latch = TheLoop->getLoopLatch();
2753  DEBUG(dbgs() << "LV: Found a loop: " <<
2754        TheLoop->getHeader()->getName() << "\n");
2755
2756  // ScalarEvolution needs to be able to find the exit count.
2757  const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
2758  if (ExitCount == SE->getCouldNotCompute()) {
2759    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
2760    return false;
2761  }
2762
2763  // Do not loop-vectorize loops with a tiny trip count.
2764  unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
2765  if (TC > 0u && TC < TinyTripCountVectorThreshold) {
2766    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
2767          "This loop is not worth vectorizing.\n");
2768    return false;
2769  }
2770
2771  // Check if we can vectorize the instructions and CFG in this loop.
2772  if (!canVectorizeInstrs()) {
2773    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
2774    return false;
2775  }
2776
2777  // Go over each instruction and look at memory deps.
2778  if (!canVectorizeMemory()) {
2779    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
2780    return false;
2781  }
2782
2783  // Collect all of the variables that remain uniform after vectorization.
2784  collectLoopUniforms();
2785
2786  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
2787        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
2788        <<"!\n");
2789
2790  // Okay! We can vectorize. At this point we don't have any other mem analysis
2791  // which may limit our maximum vectorization factor, so just return true with
2792  // no restrictions.
2793  return true;
2794}
2795
2796static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
2797  if (Ty->isPointerTy())
2798    return DL.getIntPtrType(Ty);
2799
2800  return Ty;
2801}
2802
2803static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
2804  Ty0 = convertPointerToIntegerType(DL, Ty0);
2805  Ty1 = convertPointerToIntegerType(DL, Ty1);
2806  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
2807    return Ty0;
2808  return Ty1;
2809}
2810
2811/// \brief Check that the instruction has outside loop users and is not an
2812/// identified reduction variable.
2813static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
2814                               SmallPtrSet<Value *, 4> &Reductions) {
2815  // Reduction instructions are allowed to have exit users. All other
2816  // instructions must not have external users.
2817  if (!Reductions.count(Inst))
2818    // Check that all of the users of the instruction are inside the loop.
2819    for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
2820         I != E; ++I) {
2821      Instruction *U = cast<Instruction>(*I);
2822      // This user may be a reduction exit value.
2823      if (!TheLoop->contains(U)) {
2824        DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
2825        return true;
2826      }
2827    }
2828  return false;
2829}
2830
2831bool LoopVectorizationLegality::canVectorizeInstrs() {
2832  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
2833  BasicBlock *Header = TheLoop->getHeader();
2834
2835  // Look for the attribute signaling the absence of NaNs.
2836  Function &F = *Header->getParent();
2837  if (F.hasFnAttribute("no-nans-fp-math"))
2838    HasFunNoNaNAttr = F.getAttributes().getAttribute(
2839      AttributeSet::FunctionIndex,
2840      "no-nans-fp-math").getValueAsString() == "true";
2841
2842  // For each block in the loop.
2843  for (Loop::block_iterator bb = TheLoop->block_begin(),
2844       be = TheLoop->block_end(); bb != be; ++bb) {
2845
2846    // Scan the instructions in the block and look for hazards.
2847    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
2848         ++it) {
2849
2850      if (PHINode *Phi = dyn_cast<PHINode>(it)) {
2851        Type *PhiTy = Phi->getType();
2852        // Check that this PHI type is allowed.
2853        if (!PhiTy->isIntegerTy() &&
2854            !PhiTy->isFloatingPointTy() &&
2855            !PhiTy->isPointerTy()) {
2856          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
2857          return false;
2858        }
2859
2860        // If this PHINode is not in the header block, then we know that we
2861        // can convert it to select during if-conversion. No need to check if
2862        // the PHIs in this block are induction or reduction variables.
2863        if (*bb != Header) {
2864          // Check that this instruction has no outside users or is an
2865          // identified reduction value with an outside user.
2866          if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
2867            continue;
2868          return false;
2869        }
2870
2871        // We only allow if-converted PHIs with exactly two incoming values.
2872        if (Phi->getNumIncomingValues() != 2) {
2873          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
2874          return false;
2875        }
2876
2877        // This is the value coming from the preheader.
2878        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
2879        // Check if this is an induction variable.
2880        InductionKind IK = isInductionVariable(Phi);
2881
2882        if (IK_NoInduction != IK) {
2883          // Get the widest type.
2884          if (!WidestIndTy)
2885            WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
2886          else
2887            WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
2888
2889          // Int inductions are special because we only allow one IV.
2890          if (IK == IK_IntInduction) {
2891            // Use the phi node with the widest type as induction. Use the last
2892            // one if there are multiple (no good reason for doing this other
2893            // than it is expedient).
2894            if (!Induction || PhiTy == WidestIndTy)
2895              Induction = Phi;
2896          }
2897
2898          DEBUG(dbgs() << "LV: Found an induction variable.\n");
2899          Inductions[Phi] = InductionInfo(StartValue, IK);
2900
2901          // Until we explicitly handle the case of an induction variable with
2902          // an outside loop user we have to give up vectorizing this loop.
2903          if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
2904            return false;
2905
2906          continue;
2907        }
2908
2909        if (AddReductionVar(Phi, RK_IntegerAdd)) {
2910          DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
2911          continue;
2912        }
2913        if (AddReductionVar(Phi, RK_IntegerMult)) {
2914          DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
2915          continue;
2916        }
2917        if (AddReductionVar(Phi, RK_IntegerOr)) {
2918          DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
2919          continue;
2920        }
2921        if (AddReductionVar(Phi, RK_IntegerAnd)) {
2922          DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
2923          continue;
2924        }
2925        if (AddReductionVar(Phi, RK_IntegerXor)) {
2926          DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
2927          continue;
2928        }
2929        if (AddReductionVar(Phi, RK_IntegerMinMax)) {
2930          DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
2931          continue;
2932        }
2933        if (AddReductionVar(Phi, RK_FloatMult)) {
2934          DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
2935          continue;
2936        }
2937        if (AddReductionVar(Phi, RK_FloatAdd)) {
2938          DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
2939          continue;
2940        }
2941        if (AddReductionVar(Phi, RK_FloatMinMax)) {
2942          DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<
2943                "\n");
2944          continue;
2945        }
2946
2947        DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
2948        return false;
2949      }// end of PHI handling
2950
2951      // We don't handle general function calls. However, we can ignore dbg
2952      // intrinsic calls, and we do handle certain intrinsic and libm functions.
2953      CallInst *CI = dyn_cast<CallInst>(it);
2954      if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
2955        DEBUG(dbgs() << "LV: Found a call site.\n");
2956        return false;
2957      }
2958
2959      // Check that the instruction return type is vectorizable.
2960      if (!VectorType::isValidElementType(it->getType()) &&
2961          !it->getType()->isVoidTy()) {
2962        DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
2963        return false;
2964      }
2965
2966      // Check that the stored type is vectorizable.
2967      if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
2968        Type *T = ST->getValueOperand()->getType();
2969        if (!VectorType::isValidElementType(T))
2970          return false;
2971      }
2972
2973      // Reduction instructions are allowed to have exit users.
2974      // All other instructions must not have external users.
2975      if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
2976        return false;
2977
2978    } // next instr.
2979
2980  }
2981
2982  if (!Induction) {
2983    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
2984    if (Inductions.empty())
2985      return false;
2986  }
2987
2988  return true;
2989}
2990
2991void LoopVectorizationLegality::collectLoopUniforms() {
2992  // We now know that the loop is vectorizable!
2993  // Collect variables that will remain uniform after vectorization.
2994  std::vector<Value*> Worklist;
2995  BasicBlock *Latch = TheLoop->getLoopLatch();
2996
2997  // Start with the conditional branch and walk up the block.
2998  Worklist.push_back(Latch->getTerminator()->getOperand(0));
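  // For example (illustrative): in "for (i = 0; i < n; ++i) a[i] = x;" the
  // compare "i < n" that feeds the latch branch, and everything it depends
  // on inside the loop, remains uniform (scalar) after vectorization.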
2999
3000  while (!Worklist.empty()) {
3001    Instruction *I = dyn_cast<Instruction>(Worklist.back());
3002    Worklist.pop_back();
3003
3004    // Look at instructions inside this loop.
3005    // Stop when reaching PHI nodes.
3006    // TODO: we need to follow values all over the loop, not only in this block.
3007    if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
3008      continue;
3009
3010    // This is a known uniform.
3011    Uniforms.insert(I);
3012
3013    // Insert all operands.
3014    Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
3015  }
3016}
3017
3018namespace {
3019/// \brief Analyses memory accesses in a loop.
3020///
3021/// Checks whether run time pointer checks are needed and builds sets for data
3022/// dependence checking.
3023class AccessAnalysis {
3024public:
3025  /// \brief Read or write access location.
3026  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
3027  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
3028
3029  /// \brief Set of potential dependent memory accesses.
3030  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
3031
3032  AccessAnalysis(DataLayout *Dl, DepCandidates &DA) :
3033    DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
3034    AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
3035
3036  /// \brief Register a load and whether the accessed pointer is only read from.
3037  void addLoad(Value *Ptr, bool IsReadOnly) {
3038    Accesses.insert(MemAccessInfo(Ptr, false));
3039    if (IsReadOnly)
3040      ReadOnlyPtr.insert(Ptr);
3041  }
3042
3043  /// \brief Register a store.
3044  void addStore(Value *Ptr) {
3045    Accesses.insert(MemAccessInfo(Ptr, true));
3046  }
3047
3048  /// \brief Check whether we can check the pointers at runtime for
3049  /// non-intersection.
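  ///
  /// Conceptually (a sketch, not the exact IR that gets emitted) the check
  /// for two pointers A and B has the form:
  ///   NoConflict = (A.End <= B.Start) || (B.End <= A.Start)
  /// where Start and End bound the addresses accessed over the whole loop.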
3050  bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
3051                       unsigned &NumComparisons, ScalarEvolution *SE,
3052                       Loop *TheLoop);
3053
3054  /// \brief Goes over all memory accesses, checks whether a RT check is needed
3055  /// and builds sets of dependent accesses.
3056  void buildDependenceSets() {
3057    // Process read-write pointers first.
3058    processMemAccesses(false);
3059    // Next, process read pointers.
3060    processMemAccesses(true);
3061  }
3062
3063  bool isRTCheckNeeded() { return IsRTCheckNeeded; }
3064
3065  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
3066
3067  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
3068
3069private:
3070  typedef SetVector<MemAccessInfo> PtrAccessSet;
3071  typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
3072
3073  /// \brief Go over all memory accesses (or only the deferred ones, if
3074  /// \p UseDeferred is true), check whether runtime pointer checks are needed,
3075  /// and build sets of dependency check candidates.
3076  void processMemAccesses(bool UseDeferred);
3077
3078  /// Set of all accesses.
3079  PtrAccessSet Accesses;
3080
3081  /// Set of accesses to check after all writes have been processed.
3082  PtrAccessSet DeferredAccesses;
3083
3084  /// Map of pointers to last access encountered.
3085  UnderlyingObjToAccessMap ObjToLastAccess;
3086
3087  /// Set of accesses that need a further dependence check.
3088  MemAccessInfoSet CheckDeps;
3089
3090  /// Set of pointers that are read only.
3091  SmallPtrSet<Value*, 16> ReadOnlyPtr;
3092
3093  /// Set of underlying objects already written to.
3094  SmallPtrSet<Value*, 16> WriteObjects;
3095
3096  DataLayout *DL;
3097
3098  /// Sets of potentially dependent accesses - members of one set share an
3099  /// underlying pointer. The set "CheckDeps" identifies which sets really need a
3100  /// dependence check.
3101  DepCandidates &DepCands;
3102
3103  bool AreAllWritesIdentified;
3104  bool AreAllReadsIdentified;
3105  bool IsRTCheckNeeded;
3106};
3107
3108} // end anonymous namespace
3109
3110/// \brief Check whether a pointer can participate in a runtime bounds check.
3111static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) {
3112  const SCEV *PtrScev = SE->getSCEV(Ptr);
3113  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
3114  if (!AR)
3115    return false;
3116
3117  return AR->isAffine();
3118}
3119
3120bool AccessAnalysis::canCheckPtrAtRT(
3121                       LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
3122                        unsigned &NumComparisons, ScalarEvolution *SE,
3123                        Loop *TheLoop) {
3124  // Find pointers with computable bounds. We are going to use this information
3125  // to place a runtime bound check.
3126  unsigned NumReadPtrChecks = 0;
3127  unsigned NumWritePtrChecks = 0;
3128  bool CanDoRT = true;
3129
3130  bool IsDepCheckNeeded = isDependencyCheckNeeded();
3131  // We assign a consecutive id to accesses from different dependence sets.
3132  // Accesses within the same set don't need a runtime check.
3133  unsigned RunningDepId = 1;
3134  DenseMap<Value *, unsigned> DepSetId;
3135
3136  for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end();
3137       AI != AE; ++AI) {
3138    const MemAccessInfo &Access = *AI;
3139    Value *Ptr = Access.getPointer();
3140    bool IsWrite = Access.getInt();
3141
3142    // If a pointer is both read and written, only add the write check.
3143    if (!IsWrite && Accesses.count(MemAccessInfo(Ptr, true)))
3144      continue;
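    // For example (illustrative): in "a[i] += 1" the pointer 'a' is both
    // read and written; the single bound check for the write also covers the
    // read of the same pointer.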
3145
3146    if (IsWrite)
3147      ++NumWritePtrChecks;
3148    else
3149      ++NumReadPtrChecks;
3150
3151    if (hasComputableBounds(SE, Ptr)) {
3152      // The id of the dependence set.
3153      unsigned DepId;
3154
3155      if (IsDepCheckNeeded) {
3156        Value *Leader = DepCands.getLeaderValue(Access).getPointer();
3157        unsigned &LeaderId = DepSetId[Leader];
3158        if (!LeaderId)
3159          LeaderId = RunningDepId++;
3160        DepId = LeaderId;
3161      } else
3162        // Each access has its own dependence set.
3163        DepId = RunningDepId++;
3164
3165      RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
3166
3167      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n");
3168    } else {
3169      CanDoRT = false;
3170    }
3171  }
3172
3173  if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
3174    NumComparisons = 0; // Only one dependence set.
3175  else
3176    NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
3177                                           NumWritePtrChecks - 1));
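  // For example (illustrative): 2 write pointers and 1 read pointer in
  // distinct dependence sets give NumComparisons = 2 * (1 + 2 - 1) = 4, a
  // conservative bound (the write-write pair is counted from both sides).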
3178  return CanDoRT;
3179}
3180
3181static bool isFunctionScopeIdentifiedObject(Value *Ptr) {
3182  return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa<AllocaInst>(Ptr);
3183}
3184
3185void AccessAnalysis::processMemAccesses(bool UseDeferred) {
3186  // We process the set twice: first we process read-write pointers, then we
3187  // process read-only pointers. This allows us to skip dependence tests for
3188  // read-only pointers.
3189
3190  PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
3191  for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) {
3192    const MemAccessInfo &Access = *AI;
3193    Value *Ptr = Access.getPointer();
3194    bool IsWrite = Access.getInt();
3195
3196    DepCands.insert(Access);
3197
3198    // Memorize read-only pointers for later processing and skip them in the
3199    // first round (they need to be checked after we have seen all write
3200    // pointers). Note: we also mark pointers that are not consecutive as
3201    // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the
3202    // second check for "!IsWrite".
3203    bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
3204    if (!UseDeferred && IsReadOnlyPtr) {
3205      DeferredAccesses.insert(Access);
3206      continue;
3207    }
3208
3209    bool NeedDepCheck = false;
3210    // Check whether there is the possibility of a dependence because the
3211    // underlying objects are the same.
3212    typedef SmallVector<Value*, 16> ValueVector;
3213    ValueVector TempObjects;
3214    GetUnderlyingObjects(Ptr, TempObjects, DL);
3215    for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end();
3216         UI != UE; ++UI) {
3217      Value *UnderlyingObj = *UI;
3218
3219      // If this is a write then it needs to be an identified object. If this is a
3220      // read and all writes (so far) are identified function scope objects we
3221      // don't need an identified underlying object but only an Argument (the
3222      // next write is going to invalidate this assumption if it is
3223      // unidentified).
3224      // This is a micro-optimization for the case where all writes are
3225      // identified and we have one argument pointer.
3226      // Otherwise, we do need a runtime check.
3227      if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) ||
3228          (!IsWrite && (!AreAllWritesIdentified ||
3229                        !isa<Argument>(UnderlyingObj)) &&
3230           !isIdentifiedObject(UnderlyingObj))) {
3231        DEBUG(dbgs() << "LV: Found an unidentified " <<
3232              (IsWrite ?  "write" : "read" ) << " ptr:" << *UnderlyingObj <<
3233              "\n");
3234        IsRTCheckNeeded = (IsRTCheckNeeded ||
3235                           !isIdentifiedObject(UnderlyingObj) ||
3236                           !AreAllReadsIdentified);
3237
3238        if (IsWrite)
3239          AreAllWritesIdentified = false;
3240        if (!IsWrite)
3241          AreAllReadsIdentified = false;
3242      }
3243
3244      // If this is a write - check other reads and writes for conflicts.  If
3245      // this is a read only check other writes for conflicts (but only if there
3246      // is no other write to the ptr - this is an optimization to catch "a[i] =
3247      // a[i] + " without having to do a dependence check).
3248      if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj))
3249        NeedDepCheck = true;
3250
3251      if (IsWrite)
3252        WriteObjects.insert(UnderlyingObj);
3253
3254      // Create sets of pointers connected by shared underlying objects.
3255      UnderlyingObjToAccessMap::iterator Prev =
3256        ObjToLastAccess.find(UnderlyingObj);
3257      if (Prev != ObjToLastAccess.end())
3258        DepCands.unionSets(Access, Prev->second);
3259
3260      ObjToLastAccess[UnderlyingObj] = Access;
3261    }
3262
3263    if (NeedDepCheck)
3264      CheckDeps.insert(Access);
3265  }
3266}
3267
3268namespace {
3269/// \brief Checks memory dependences among accesses to the same underlying
3270  /// object to determine whether vectorization is legal or not (and at
3271/// which vectorization factor).
3272///
3273/// This class works under the assumption that we already checked that memory
3274/// locations with different underlying pointers are "must-not alias".
3275  /// We use the ScalarEvolution framework to symbolically evaluate access
3276/// functions pairs. Since we currently don't restructure the loop we can rely
3277/// on the program order of memory accesses to determine their safety.
3278/// At the moment we will only deem accesses as safe for:
3279///  * A negative constant distance assuming program order.
3280///
3281///      Safe: tmp = a[i + 1];     OR     a[i + 1] = x;
3282///            a[i] = tmp;                y = a[i];
3283///
3284  ///   The latter case is safe because later checks guarantee that there can't
3285  ///   be a cycle through a phi node (that is, we check that "x" and "y" are not
3286///   the same variable: a header phi can only be an induction or a reduction, a
3287///   reduction can't have a memory sink, an induction can't have a memory
3288///   source). This is important and must not be violated (or we have to
3289///   resort to checking for cycles through memory).
3290///
3291///  * A positive constant distance assuming program order that is bigger
3292///    than the biggest memory access.
3293///
3294///     tmp = a[i]        OR              b[i] = x
3295///     a[i+2] = tmp                      y = b[i+2];
3296///
3297///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
3298///
3299  ///  * A zero distance, provided that all accesses have the same size.
3300///
3301class MemoryDepChecker {
3302public:
3303  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
3304  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
3305
3306  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) :
3307    SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {}
3308
3309  /// \brief Register the location (instructions are given increasing numbers)
3310  /// of a write access.
3311  void addAccess(StoreInst *SI) {
3312    Value *Ptr = SI->getPointerOperand();
3313    Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
3314    InstMap.push_back(SI);
3315    ++AccessIdx;
3316  }
3317
3318  /// \brief Register the location (instructions are given increasing numbers)
3319  /// of a read access.
3320  void addAccess(LoadInst *LI) {
3321    Value *Ptr = LI->getPointerOperand();
3322    Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
3323    InstMap.push_back(LI);
3324    ++AccessIdx;
3325  }
3326
3327  /// \brief Check whether the dependencies between the accesses are safe.
3328  ///
3329  /// Only checks sets with elements in \p CheckDeps.
3330  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
3331                   MemAccessInfoSet &CheckDeps);
3332
3333  /// \brief The maximum number of bytes of a vector register we can vectorize
3334  /// the accesses safely with.
3335  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
3336
3337private:
3338  ScalarEvolution *SE;
3339  DataLayout *DL;
3340  const Loop *InnermostLoop;
3341
3342  /// \brief Maps access locations (ptr, read/write) to program order.
3343  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
3344
3345  /// \brief Memory access instructions in program order.
3346  SmallVector<Instruction *, 16> InstMap;
3347
3348  /// \brief The program order index to be used for the next instruction.
3349  unsigned AccessIdx;
3350
3351  /// \brief We can access this many bytes in parallel safely.
3352  unsigned MaxSafeDepDistBytes;
3353
3354  /// \brief Check whether there is a plausible dependence between the two
3355  /// accesses.
3356  ///
3357  /// Access \p A must happen before \p B in program order. The two indices
3358  /// identify the index into the program order map.
3359  ///
3360  /// This function checks whether there is a plausible dependence (or the
3361  /// absence of such can't be proved) between the two accesses. If there is a
3362  /// plausible dependence but the dependence distance is bigger than one
3363  /// element access it records this distance in \p MaxSafeDepDistBytes (if this
3364  /// distance is smaller than any other distance encountered so far).
3365  /// Otherwise, this function returns true signaling a possible dependence.
3366  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
3367                   const MemAccessInfo &B, unsigned BIdx);
3368
3369  /// \brief Check whether the data dependence could prevent store-load
3370  /// forwarding.
3371  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
3372};
3373
3374} // end anonymous namespace
3375
3376static bool isInBoundsGep(Value *Ptr) {
3377  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
3378    return GEP->isInBounds();
3379  return false;
3380}
3381
3382/// \brief Check whether the access through \p Ptr has a constant stride.
3383static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
3384                        const Loop *Lp) {
3385  const Type *Ty = Ptr->getType();
3386  assert(Ty->isPointerTy() && "Unexpected non ptr");
3387
3388  // Make sure that the pointer does not point to aggregate types.
3389  const PointerType *PtrTy = cast<PointerType>(Ty);
3390  if (PtrTy->getElementType()->isAggregateType()) {
3391    DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr <<
3392          "\n");
3393    return 0;
3394  }
3395
3396  const SCEV *PtrScev = SE->getSCEV(Ptr);
3397  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
3398  if (!AR) {
3399    DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
3400          << *Ptr << " SCEV: " << *PtrScev << "\n");
3401    return 0;
3402  }
3403
3404  // The access function must stride over the innermost loop.
3405  if (Lp != AR->getLoop()) {
3406    DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<
3407          *Ptr << " SCEV: " << *PtrScev << "\n");
    return 0;
3408  }
3409
3410  // The address calculation must not wrap. Otherwise, a dependence could be
3411  // inverted.
3412  // An inbounds getelementptr that is an AddRec with a unit stride
3413  // cannot wrap by definition. The unit stride requirement is checked later.
3414  // A getelementptr without an inbounds attribute and unit stride would have
3415  // to access the pointer value "0" which is undefined behavior in address
3416  // space 0, therefore we can also vectorize this case.
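  // For example (illustrative), an access through
  //   %gep = getelementptr inbounds i32* %a, i64 %iv
  // with a unit step cannot wrap, by the semantics of 'inbounds'.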
3417  bool IsInBoundsGEP = isInBoundsGep(Ptr);
3418  bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
3419  bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
3420  if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
3421    DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
3422          << *Ptr << " SCEV: " << *PtrScev << "\n");
3423    return 0;
3424  }
3425
3426  // Check the step is constant.
3427  const SCEV *Step = AR->getStepRecurrence(*SE);
3428
3429  // Calculate the pointer stride and check if it is consecutive.
3430  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
3431  if (!C) {
3432    DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<
3433          " SCEV: " << *PtrScev << "\n");
3434    return 0;
3435  }
3436
3437  int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
3438  const APInt &APStepVal = C->getValue()->getValue();
3439
3440  // Huge step value - give up.
3441  if (APStepVal.getBitWidth() > 64)
3442    return 0;
3443
3444  int64_t StepVal = APStepVal.getSExtValue();
3445
3446  // Strided access.
3447  int64_t Stride = StepVal / Size;
3448  int64_t Rem = StepVal % Size;
3449  if (Rem)
3450    return 0;
3451
3452  // If the SCEV could wrap but we have an inbounds gep with a unit stride we
3453  // know we can't "wrap around the address space". In case of address space
3454  // zero we know that this won't happen without triggering undefined behavior.
3455  if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
3456      Stride != 1 && Stride != -1)
3457    return 0;
3458
3459  return Stride;
3460}
3461
3462bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
3463                                                    unsigned TypeByteSize) {
3464  // If loads occur at a distance that is not a multiple of a feasible vector
3465  // factor, store-load forwarding does not take place.
3466  // Positive dependences might cause trouble because vectorizing them might
3467  // prevent store-load forwarding, making vectorized code run a lot slower.
3468  //   a[i] = a[i-3] ^ a[i-8];
3469  //   The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
3470  //   hence on your typical architecture store-load forwarding does not take
3471  //   place. Vectorizing in such cases does not make sense.
3472  // Store-load forwarding distance.
3473  const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
3474  // Maximum vector factor.
3475  unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
3476  if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
3477    MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
3478
3479  for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
3480       vf *= 2) {
3481    if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
3482      MaxVFWithoutSLForwardIssues = (vf >> 1);
3483      break;
3484    }
3485  }
3486
3487  if (MaxVFWithoutSLForwardIssues < 2*TypeByteSize) {
3488    DEBUG(dbgs() << "LV: Distance " << Distance <<
3489          " could cause a store-load forwarding conflict\n");
3490    return true;
3491  }
3492
3493  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
3494      MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
3495    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
3496  return false;
3497}
3498
3499bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
3500                                   const MemAccessInfo &B, unsigned BIdx) {
3501  assert(AIdx < BIdx && "Must pass arguments in program order");
3502
3503  Value *APtr = A.getPointer();
3504  Value *BPtr = B.getPointer();
3505  bool AIsWrite = A.getInt();
3506  bool BIsWrite = B.getInt();
3507
3508  // Two reads are independent.
3509  if (!AIsWrite && !BIsWrite)
3510    return false;
3511
3512  const SCEV *AScev = SE->getSCEV(APtr);
3513  const SCEV *BScev = SE->getSCEV(BPtr);
3514
3515  int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop);
3516  int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop);
3517
3518  const SCEV *Src = AScev;
3519  const SCEV *Sink = BScev;
3520
3521  // If the induction step is negative we have to invert source and sink of the
3522  // dependence.
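  // For example (illustrative): when the loop counts down, the access that
  // executes later touches the lower address, so source and sink are swapped
  // to keep the distance computation in terms of increasing addresses.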
3523  if (StrideAPtr < 0) {
3526    std::swap(APtr, BPtr);
3527    std::swap(Src, Sink);
3528    std::swap(AIsWrite, BIsWrite);
3529    std::swap(AIdx, BIdx);
3530    std::swap(StrideAPtr, StrideBPtr);
3531  }
3532
3533  const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
3534
3535  DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
3536        << "(Induction step: " << StrideAPtr <<  ")\n");
3537  DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
3538        << *InstMap[BIdx] << ": " << *Dist << "\n");
3539
3540  // Need consecutive accesses. We don't want to vectorize
3541  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
3542  // the address space.
3543  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
3544    DEBUG(dbgs() << "Non-consecutive pointer access\n");
3545    return true;
3546  }
3547
3548  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
3549  if (!C) {
3550    DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
3551    return true;
3552  }
3553
3554  Type *ATy = APtr->getType()->getPointerElementType();
3555  Type *BTy = BPtr->getType()->getPointerElementType();
3556  unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
3557
3558  // Negative distances are not plausible dependencies.
3559  const APInt &Val = C->getValue()->getValue();
3560  if (Val.isNegative()) {
3561    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
3562    if (IsTrueDataDependence &&
3563        (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
3564         ATy != BTy))
3565      return true;
3566
3567    DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
3568    return false;
3569  }
3570
3571  // Write to the same location with the same size.
3572  // Could be improved to assert type sizes are the same (i32 == float, etc).
3573  if (Val == 0) {
3574    if (ATy == BTy)
3575      return false;
3576    DEBUG(dbgs() << "LV: Zero dependence difference but different types");
3577    return true;
3578  }
3579
3580  assert(Val.isStrictlyPositive() && "Expect a positive value");
3581
3582  // Positive distance bigger than max vectorization factor.
3583  if (ATy != BTy) {
3584    DEBUG(dbgs() <<
3585          "LV: ReadWrite-Write positive dependency with different types");
3586    return false;
3587  }
3588
3589  unsigned Distance = (unsigned) Val.getZExtValue();
3590
3591  // Bail out early if passed-in parameters make vectorization not feasible.
3592  unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
3593  unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;
3594
3595  // The distance must be bigger than the size needed for a vectorized version
3596  // of the operation and the size of the vectorized operation must not be
3597  // bigger than the current maximum size.
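  // For example (illustrative): with i32 accesses (TypeByteSize = 4) and a
  // dependence distance of 12 bytes, vectorizing 2 lanes (8 bytes) is safe,
  // but 4 lanes (16 bytes) would straddle the dependence.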
3598  if (Distance < 2*TypeByteSize ||
3599      2*TypeByteSize > MaxSafeDepDistBytes ||
3600      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
3601    DEBUG(dbgs() << "LV: Failure because of Positive distance "
3602        << Val.getSExtValue() << "\n");
3603    return true;
3604  }
3605
3606  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
3607    Distance : MaxSafeDepDistBytes;
3608
3609  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
3610  if (IsTrueDataDependence &&
3611      couldPreventStoreLoadForward(Distance, TypeByteSize))
3612     return true;
3613
3614  DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
3615        " with max VF=" << MaxSafeDepDistBytes/TypeByteSize << "\n");
3616
3617  return false;
3618}
3619
3620bool
3621MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
3622                              MemAccessInfoSet &CheckDeps) {
3623
3624  MaxSafeDepDistBytes = -1U;
3625  while (!CheckDeps.empty()) {
3626    MemAccessInfo CurAccess = *CheckDeps.begin();
3627
3628    // Get the relevant memory access set.
3629    EquivalenceClasses<MemAccessInfo>::iterator I =
3630      AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
3631
3632    // Check accesses within this set.
3633    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
3634    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
3635
3636    // Check every access pair.
3637    while (AI != AE) {
3638      CheckDeps.erase(*AI);
3639      EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
3640      while (OI != AE) {
3641        // Check every accessing instruction pair in program order.
3642        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
3643             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
3644          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
3645               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
3646            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2))
3647              return false;
3648            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1))
3649              return false;
3650          }
3651        ++OI;
3652      }
3653      ++AI;
3654    }
3655  }
3656  return true;
3657}
3658
3659bool LoopVectorizationLegality::canVectorizeMemory() {
3660
3661  typedef SmallVector<Value*, 16> ValueVector;
3662  typedef SmallPtrSet<Value*, 16> ValueSet;
3663
3664  // Holds the Load and Store *instructions*.
3665  ValueVector Loads;
3666  ValueVector Stores;
3667
3668  // Holds all the different accesses in the loop.
3669  unsigned NumReads = 0;
3670  unsigned NumReadWrites = 0;
3671
3672  PtrRtCheck.Pointers.clear();
3673  PtrRtCheck.Need = false;
3674
3675  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
3676  MemoryDepChecker DepChecker(SE, DL, TheLoop);
3677
3678  // For each block.
3679  for (Loop::block_iterator bb = TheLoop->block_begin(),
3680       be = TheLoop->block_end(); bb != be; ++bb) {
3681
3682    // Scan the BB and collect legal loads and stores.
3683    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
3684         ++it) {
3685
3686      // If this is a load, save it. If this instruction can read from memory
3687      // but is not a load, then we quit. Notice that we don't handle function
3688      // calls that read or write.
3689      if (it->mayReadFromMemory()) {
3690        // Many math library functions read the rounding mode. We will only
3691        // vectorize a loop if it contains known function calls that don't set
3692        // the flag. Therefore, it is safe to ignore this read from memory.
3693        CallInst *Call = dyn_cast<CallInst>(it);
3694        if (Call && getIntrinsicIDForCall(Call, TLI))
3695          continue;
3696
3697        LoadInst *Ld = dyn_cast<LoadInst>(it);
3698        if (!Ld) return false;
3699        if (!Ld->isSimple() && !IsAnnotatedParallel) {
3700          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
3701          return false;
3702        }
3703        Loads.push_back(Ld);
3704        DepChecker.addAccess(Ld);
3705        continue;
3706      }
3707
3708      // Save 'store' instructions. Abort if other instructions write to memory.
3709      if (it->mayWriteToMemory()) {
3710        StoreInst *St = dyn_cast<StoreInst>(it);
3711        if (!St) return false;
3712        if (!St->isSimple() && !IsAnnotatedParallel) {
3713          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
3714          return false;
3715        }
3716        Stores.push_back(St);
3717        DepChecker.addAccess(St);
3718      }
3719    } // next instr.
3720  } // next block.
3721
3722  // Now we have two lists that hold the loads and the stores.
3723  // Next, we find the pointers that they use.
3724
3725  // Check if we see any stores. If there are no stores, then we don't
3726  // care if the pointers are *restrict*.
3727  if (Stores.empty()) {
3728    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
3729    return true;
3730  }
3731
3732  AccessAnalysis::DepCandidates DependentAccesses;
3733  AccessAnalysis Accesses(DL, DependentAccesses);
3734
3735  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
3736  // multiple times on the same object. If the ptr is accessed twice, once
3737  // for read and once for write, it will only appear once (on the write
3738  // list). This is okay, since we are going to check for conflicts between
3739  // writes and between reads and writes, but not between reads and reads.
3740  ValueSet Seen;
3741
3742  ValueVector::iterator I, IE;
3743  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
3744    StoreInst *ST = cast<StoreInst>(*I);
3745    Value* Ptr = ST->getPointerOperand();
3746
3747    if (isUniform(Ptr)) {
3748      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
3749      return false;
3750    }
3751
3752    // If we did *not* see this pointer before, insert it into the read-write
3753    // list. At this phase it is only a 'write' list.
3754    if (Seen.insert(Ptr)) {
3755      ++NumReadWrites;
3756      Accesses.addStore(Ptr);
3757    }
3758  }
3759
3760  if (IsAnnotatedParallel) {
3761    DEBUG(dbgs()
3762          << "LV: A loop annotated parallel, ignore memory dependency "
3763          << "checks.\n");
3764    return true;
3765  }
3766
3767  SmallPtrSet<Value *, 16> ReadOnlyPtr;
3768  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
3769    LoadInst *LD = cast<LoadInst>(*I);
3770    Value* Ptr = LD->getPointerOperand();
3771    // If we did *not* see this pointer before, insert it to the
3772    // read list. If we *did* see it before, then it is already in
3773    // the read-write list. This allows us to vectorize expressions
3774    // such as A[i] += x;  Because the address of A[i] is a read-write
3775    // pointer. This only works if the index of A[i] is consecutive.
3776    // If the address of i is unknown (for example A[B[i]]) then we may
3777    // read a few words, modify, and write a few words, and some of the
3778    // words may be written to the same address.
3779    bool IsReadOnlyPtr = false;
3780    if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) {
3781      ++NumReads;
3782      IsReadOnlyPtr = true;
3783    }
3784    Accesses.addLoad(Ptr, IsReadOnlyPtr);
3785  }
3786
3787  // If we write (or read-write) to a single destination and there are no
3788  // other reads in this loop, then it is safe to vectorize.
3789  if (NumReadWrites == 1 && NumReads == 0) {
3790    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
3791    return true;
3792  }
3793
3794  // Build dependence sets and check whether we need a runtime pointer bounds
3795  // check.
3796  Accesses.buildDependenceSets();
3797  bool NeedRTCheck = Accesses.isRTCheckNeeded();
3798
3799  // Find pointers with computable bounds. We are going to use this information
3800  // to place a runtime bound check.
3801  unsigned NumComparisons = 0;
3802  bool CanDoRT = false;
3803  if (NeedRTCheck)
3804    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop);
3805
3806
3807  DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
3808        " pointer comparisons.\n");
3809
3810  // If we only have one set of dependences to check pointers among, we don't
3811  // need a runtime check.
3812  if (NumComparisons == 0 && NeedRTCheck)
3813    NeedRTCheck = false;
3814
3815  // Check that we did not collect too many pointers or found an unsizeable
3816  // pointer.
3817  if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
3818    PtrRtCheck.reset();
3819    CanDoRT = false;
3820  }
3821
3822  if (CanDoRT) {
3823    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
3824  }
3825
3826  if (NeedRTCheck && !CanDoRT) {
3827    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
3828          "the array bounds.\n");
3829    PtrRtCheck.reset();
3830    return false;
3831  }
3832
3833  PtrRtCheck.Need = NeedRTCheck;
3834
3835  bool CanVecMem = true;
3836  if (Accesses.isDependencyCheckNeeded()) {
3837    DEBUG(dbgs() << "LV: Checking memory dependencies\n");
3838    CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
3839                                       Accesses.getDependenciesToCheck());
3840    MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
3841  }
3842
3843  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
3844        " need a runtime memory check.\n");
3845
3846  return CanVecMem;
3847}
3848
3849static bool hasMultipleUsesOf(Instruction *I,
3850                              SmallPtrSet<Instruction *, 8> &Insts) {
3851  unsigned NumUses = 0;
3852  for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
3853    if (Insts.count(dyn_cast<Instruction>(*Use)))
3854      ++NumUses;
3855    if (NumUses > 1)
3856      return true;
3857  }
3858
3859  return false;
3860}
3861
3862static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) {
3863  for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
3864    if (!Set.count(dyn_cast<Instruction>(*Use)))
3865      return false;
3866  return true;
3867}
3868
3869bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
3870                                                ReductionKind Kind) {
3871  if (Phi->getNumIncomingValues() != 2)
3872    return false;
3873
3874  // Reduction variables are only found in the loop header block.
3875  if (Phi->getParent() != TheLoop->getHeader())
3876    return false;
3877
3878  // Obtain the reduction start value from the value that comes from the loop
3879  // preheader.
3880  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
3881
3882  // ExitInstruction is the single value which is used outside the loop.
3883  // We only allow for a single reduction value to be used outside the loop.
3884  // This includes users of the reduction variables (which form a cycle that
3885  // ends in the phi node).
3886  Instruction *ExitInstruction = 0;
3887  // Indicates that we found a reduction operation in our scan.
3888  bool FoundReduxOp = false;
3889
3890  // We start with the PHI node and scan for all of the users of this
3891  // instruction. All users must be instructions that can be used as reduction
3892  // variables (such as ADD). We must have a single out-of-block user. The cycle
3893  // must include the original PHI.
3894  bool FoundStartPHI = false;
3895
3896  // To recognize min/max patterns formed by an icmp/select sequence, we store
3897  // the number of instructions we saw from the recognized min/max pattern,
3898  // to make sure we see exactly the two instructions.
3899  unsigned NumCmpSelectPatternInst = 0;
3900  ReductionInstDesc ReduxDesc(false, 0);
3901
3902  SmallPtrSet<Instruction *, 8> VisitedInsts;
3903  SmallVector<Instruction *, 8> Worklist;
3904  Worklist.push_back(Phi);
3905  VisitedInsts.insert(Phi);
3906
3907  // A value in the reduction can be used:
3908  //  - By the reduction:
3909  //      - Reduction operation:
3910  //        - One use of reduction value (safe).
3911  //        - Multiple use of reduction value (not safe).
3912  //      - PHI:
3913  //        - All uses of the PHI must be the reduction (safe).
3914  //        - Otherwise, not safe.
3915  //  - By one instruction outside of the loop (safe).
3916  //  - By further instructions outside of the loop (not safe).
3917  //  - By an instruction that is not part of the reduction (not safe).
3918  //    This is either:
3919  //      * An instruction type other than PHI or the reduction operation.
3920  //      * A PHI in the header other than the initial PHI.
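  // For example (illustrative), the scan accepts the classic pattern
  //   for (i = 0; i < n; ++i) sum += a[i];
  // i.e. a header phi %sum = phi [%start, %preheader], [%add, %latch] whose
  // only in-loop user is %add = add %sum, %load, with %add used once more
  // outside the loop as the final reduced value.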
3921  while (!Worklist.empty()) {
3922    Instruction *Cur = Worklist.back();
3923    Worklist.pop_back();
3924
3925    // No Users.
3926    // If the instruction has no users then this is a broken chain and can't be
3927    // a reduction variable.
3928    if (Cur->use_empty())
3929      return false;
3930
3931    bool IsAPhi = isa<PHINode>(Cur);
3932
3933    // A header PHI use other than the original PHI.
3934    if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
3935      return false;
3936
3937    // Reductions with non-commutative instructions such as Div and Sub are
3938    // only possible if the LHS is the reduction variable.
3939    if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
3940        !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
3941        !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
3942      return false;
3943
3944    // Any reduction instruction must be of one of the allowed kinds.
3945    ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
3946    if (!ReduxDesc.IsReduction)
3947      return false;
3948
3949    // A reduction operation must only have one use of the reduction value.
3950    if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
3951        hasMultipleUsesOf(Cur, VisitedInsts))
3952      return false;
3953
3954    // All inputs to a PHI node must be reduction values.
3955    if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
3956      return false;
3957
3958    if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
3959                                     isa<SelectInst>(Cur)))
3960      ++NumCmpSelectPatternInst;
3961    if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
3962                                   isa<SelectInst>(Cur)))
3963      ++NumCmpSelectPatternInst;
3964
3965    // Check whether we found a reduction operator.
3966    FoundReduxOp |= !IsAPhi;
3967
3968    // Process users of current instruction. Push non PHI nodes after PHI nodes
3969    // onto the stack. This way we are going to have seen all inputs to PHI
3970    // nodes once we get to them.
3971    SmallVector<Instruction *, 8> NonPHIs;
3972    SmallVector<Instruction *, 8> PHIs;
3973    for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
3974         ++UI) {
3975      Instruction *Usr = cast<Instruction>(*UI);
3976
3977      // Check if we found the exit user.
3978      BasicBlock *Parent = Usr->getParent();
3979      if (!TheLoop->contains(Parent)) {
3980        // Exit if we find multiple outside users or if the header phi node is
3981        // used directly outside the loop. In that case the user sees the value
3982        // of the previous iteration, and we would lose "VF-1" iterations of the
3983        // reduction operation if we vectorized.
3984        if (ExitInstruction != 0 || Cur == Phi)
3985          return false;
3986
3987        ExitInstruction = Cur;
3988        continue;
3989      }
3990
3991      // Process instructions only once (termination).
3992      if (VisitedInsts.insert(Usr)) {
3993        if (isa<PHINode>(Usr))
3994          PHIs.push_back(Usr);
3995        else
3996          NonPHIs.push_back(Usr);
3997      }
3998      // Remember that we completed the cycle.
3999      if (Usr == Phi)
4000        FoundStartPHI = true;
4001    }
4002    Worklist.append(PHIs.begin(), PHIs.end());
4003    Worklist.append(NonPHIs.begin(), NonPHIs.end());
4004  }
4005
4006  // Bail out if we saw only one instruction of the min/max pattern, or
4007  // more than just the select and cmp.
4008  if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
4009      NumCmpSelectPatternInst != 2)
4010    return false;
4011
4012  if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
4013    return false;
4014
4015  // We found a reduction var if we have reached the original phi node and we
4016  // only have a single instruction with out-of-loop users.
4017
4018  // This instruction is allowed to have out-of-loop users.
4019  AllowedExit.insert(ExitInstruction);
4020
4021  // Save the description of this reduction variable.
4022  ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
4023                         ReduxDesc.MinMaxKind);
4024  Reductions[Phi] = RD;
4025  // We've ended the cycle. This is a reduction variable if we have an
4026  // outside user and it has a binary op.
4027
4028  return true;
4029}
4030
4031/// Returns a descriptor recognizing a Select(ICmp(X, Y), X, Y) instruction
4032/// pattern corresponding to a min(X, Y) or max(X, Y).
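///
/// For example (illustrative IR):
///   %cmp = icmp slt i32 %x, %y
///   %min = select i1 %cmp, i32 %x, i32 %y
/// is recognized as a signed-min (MRK_SIntMin) pattern.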
4033LoopVectorizationLegality::ReductionInstDesc
4034LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
4035                                                    ReductionInstDesc &Prev) {
4036
4037  assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
4038         "Expect a select instruction");
4039  Instruction *Cmp = 0;
4040  SelectInst *Select = 0;
4041
4042  // We must handle the select(cmp()) as a single instruction. Advance to the
4043  // select.
4044  if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
4045    if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
4046      return ReductionInstDesc(false, I);
4047    return ReductionInstDesc(Select, Prev.MinMaxKind);
4048  }
4049
4050  // Only handle single use cases for now.
4051  if (!(Select = dyn_cast<SelectInst>(I)))
4052    return ReductionInstDesc(false, I);
4053  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
4054      !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
4055    return ReductionInstDesc(false, I);
4056  if (!Cmp->hasOneUse())
4057    return ReductionInstDesc(false, I);
4058
4059  Value *CmpLeft;
4060  Value *CmpRight;
4061
4062  // Look for a min/max pattern.
4063  if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
4064    return ReductionInstDesc(Select, MRK_UIntMin);
4065  else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
4066    return ReductionInstDesc(Select, MRK_UIntMax);
4067  else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
4068    return ReductionInstDesc(Select, MRK_SIntMax);
4069  else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
4070    return ReductionInstDesc(Select, MRK_SIntMin);
4071  else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
4072    return ReductionInstDesc(Select, MRK_FloatMin);
4073  else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
4074    return ReductionInstDesc(Select, MRK_FloatMax);
4075  else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
4076    return ReductionInstDesc(Select, MRK_FloatMin);
4077  else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
4078    return ReductionInstDesc(Select, MRK_FloatMax);
4079
4080  return ReductionInstDesc(false, I);
4081}
4082
4083LoopVectorizationLegality::ReductionInstDesc
4084LoopVectorizationLegality::isReductionInstr(Instruction *I,
4085                                            ReductionKind Kind,
4086                                            ReductionInstDesc &Prev) {
4087  bool FP = I->getType()->isFloatingPointTy();
4088  bool FastMath = (FP && I->isCommutative() && I->isAssociative());
4089  switch (I->getOpcode()) {
4090  default:
4091    return ReductionInstDesc(false, I);
4092  case Instruction::PHI:
4093    if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
4094               Kind != RK_FloatMinMax))
4095      return ReductionInstDesc(false, I);
4096    return ReductionInstDesc(I, Prev.MinMaxKind);
4097  case Instruction::Sub:
4098  case Instruction::Add:
4099    return ReductionInstDesc(Kind == RK_IntegerAdd, I);
4100  case Instruction::Mul:
4101    return ReductionInstDesc(Kind == RK_IntegerMult, I);
4102  case Instruction::And:
4103    return ReductionInstDesc(Kind == RK_IntegerAnd, I);
4104  case Instruction::Or:
4105    return ReductionInstDesc(Kind == RK_IntegerOr, I);
4106  case Instruction::Xor:
4107    return ReductionInstDesc(Kind == RK_IntegerXor, I);
4108  case Instruction::FMul:
4109    return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
4110  case Instruction::FAdd:
4111    return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
4112  case Instruction::FCmp:
4113  case Instruction::ICmp:
4114  case Instruction::Select:
4115    if (Kind != RK_IntegerMinMax &&
4116        (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
4117      return ReductionInstDesc(false, I);
4118    return isMinMaxSelectCmpPattern(I, Prev);
4119  }
4120}
4121
4122LoopVectorizationLegality::InductionKind
4123LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
4124  Type *PhiTy = Phi->getType();
4125  // We only handle integer and pointer induction variables.
4126  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
4127    return IK_NoInduction;
4128
4129  // Check that the PHI is consecutive.
4130  const SCEV *PhiScev = SE->getSCEV(Phi);
4131  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
4132  if (!AR) {
4133    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
4134    return IK_NoInduction;
4135  }
4136  const SCEV *Step = AR->getStepRecurrence(*SE);
4137
4138  // Integer inductions need a step of plus or minus one.
4139  if (PhiTy->isIntegerTy()) {
4140    if (Step->isOne())
4141      return IK_IntInduction;
4142    if (Step->isAllOnesValue())
4143      return IK_ReverseIntInduction;
4144    return IK_NoInduction;
4145  }
4146
4147  // Calculate the pointer stride and check if it is consecutive.
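  // For example (hypothetical IR), a phi that advances by
  //   getelementptr inbounds i32* %p, i64 1
  // steps by sizeof(i32) == 4 bytes per iteration, which matches the type
  // allocation size below and is classified as IK_PtrInduction.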
4148  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
4149  if (!C)
4150    return IK_NoInduction;
4151
4152  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
4153  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
4154  if (C->getValue()->equalsInt(Size))
4155    return IK_PtrInduction;
4156  else if (C->getValue()->equalsInt(0 - Size))
4157    return IK_ReversePtrInduction;
4158
4159  return IK_NoInduction;
4160}
4161
4162bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
4163  Value *In0 = const_cast<Value*>(V);
4164  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
4165  if (!PN)
4166    return false;
4167
4168  return Inductions.count(PN);
4169}
4170
4171bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
4172  assert(TheLoop->contains(BB) && "Unknown block used");
4173
4174  // Blocks that do not dominate the latch need predication.
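  // For example, the 'then' block of an if inside the loop body does not
  // dominate the latch: its instructions only execute on some iterations and
  // must be predicated when the control flow is flattened.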
4175  BasicBlock* Latch = TheLoop->getLoopLatch();
4176  return !DT->dominates(BB, Latch);
4177}
4178
4179bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
4180                                            SmallPtrSet<Value *, 8>& SafePtrs) {
4181  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
4182    // We might be able to hoist the load.
4183    if (it->mayReadFromMemory()) {
4184      LoadInst *LI = dyn_cast<LoadInst>(it);
4185      if (!LI || !SafePtrs.count(LI->getPointerOperand()))
4186        return false;
4187    }
4188
4189    // We don't predicate stores at the moment.
4190    if (it->mayWriteToMemory() || it->mayThrow())
4191      return false;
4192
4193    // The instructions below can trap.
4194    switch (it->getOpcode()) {
4195    default: continue;
4196    case Instruction::UDiv:
4197    case Instruction::SDiv:
4198    case Instruction::URem:
4199    case Instruction::SRem:
4200      return false;
4201    }
4202  }
4203
4204  return true;
4205}
4206
4207LoopVectorizationCostModel::VectorizationFactor
4208LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
4209                                                      unsigned UserVF) {
4210  // Width 1 means no vectorization.
4211  VectorizationFactor Factor = { 1U, 0U };
4212  if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
4213    DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
4214    return Factor;
4215  }
4216
4217  // Find the trip count.
4218  unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
4219  DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
4220
4221  unsigned WidestType = getWidestType();
4222  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4223  unsigned MaxSafeDepDist = -1U;
4224  if (Legal->getMaxSafeDepDistBytes() != -1U)
4225    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
4226  WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
4227                    WidestRegister : MaxSafeDepDist);
4228  unsigned MaxVectorSize = WidestRegister / WidestType;
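  // For example, on a hypothetical target with 128-bit vector registers and a
  // widest loop type of 32 bits, MaxVectorSize is 128 / 32 = 4 lanes.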
4229  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
4230  DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
4231
4232  if (MaxVectorSize == 0) {
4233    DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4234    MaxVectorSize = 1;
4235  }
4236
4237  assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
4238         " into one vector!");
4239
4240  unsigned VF = MaxVectorSize;
4241
4242  // If we optimize the program for size, avoid creating the tail loop.
4243  if (OptForSize) {
4244    // Give up if the trip count is unknown (zero) or smaller than two.
4245    if (TC < 2) {
4246      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
4247      return Factor;
4248    }
4249
4250    // Find the maximum SIMD width that can fit within the trip count.
4251    VF = TC % MaxVectorSize;
4252
4253    if (VF == 0)
4254      VF = MaxVectorSize;
4255
4256    // A vectorization factor below two would still leave a scalar remainder
4257    // loop, so give up.
4258    if (VF < 2) {
4259      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
4260      return Factor;
4261    }
4262  }
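  // For example, with a known trip count of 10 and MaxVectorSize of 8, the
  // code above picks VF = 10 % 8 = 2: five wide iterations cover the loop
  // exactly and no scalar tail is needed.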
4263
4264  if (UserVF != 0) {
4265    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
4266    DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
4267
4268    Factor.Width = UserVF;
4269    return Factor;
4270  }
4271
4272  float Cost = expectedCost(1);
4273  unsigned Width = 1;
4274  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
4275  for (unsigned i=2; i <= VF; i*=2) {
4276    // Notice that the vector loop needs to be executed less times, so
4277    // we need to divide the cost of the vector loops by the width of
4278    // the vector elements.
4279    float VectorCost = expectedCost(i) / (float)i;
4280    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
4281          (int)VectorCost << ".\n");
4282    if (VectorCost < Cost) {
4283      Cost = VectorCost;
4284      Width = i;
4285    }
4286  }
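  // For example (hypothetical costs): if the scalar loop costs 8 and a
  // width-4 loop costs 20, the per-lane cost 20 / 4 = 5 beats 8, so width 4
  // wins unless a wider factor is cheaper still.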
4287
4288  DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
4289  Factor.Width = Width;
4290  Factor.Cost = Width * Cost;
4291  return Factor;
4292}
4293
4294unsigned LoopVectorizationCostModel::getWidestType() {
4295  unsigned MaxWidth = 8;
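  // For example, a hypothetical loop that loads i8 elements but accumulates
  // them into an i32 reduction phi reports a widest type of 32 bits.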
4296
4297  // For each block.
4298  for (Loop::block_iterator bb = TheLoop->block_begin(),
4299       be = TheLoop->block_end(); bb != be; ++bb) {
4300    BasicBlock *BB = *bb;
4301
4302    // For each instruction in the loop.
4303    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
4304      Type *T = it->getType();
4305
4306      // Only examine Loads, Stores and PHINodes.
4307      if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
4308        continue;
4309
4310      // Only examine PHI nodes that are reduction variables; skip the rest.
4311      if (PHINode *PN = dyn_cast<PHINode>(it))
4312        if (!Legal->getReductionVars()->count(PN))
4313          continue;
4314
4315      // Examine the stored values.
4316      if (StoreInst *ST = dyn_cast<StoreInst>(it))
4317        T = ST->getValueOperand()->getType();
4318
4319      // Ignore loaded pointer types and stored pointer types that are not
4320      // consecutive. However, we do want to take consecutive stores/loads of
4321      // pointer vectors into account.
4322      if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
4323        continue;
4324
4325      MaxWidth = std::max(MaxWidth,
4326                          (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
4327    }
4328  }
4329
4330  return MaxWidth;
4331}
4332
4333unsigned
4334LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
4335                                               unsigned UserUF,
4336                                               unsigned VF,
4337                                               unsigned LoopCost) {
4338
4339  // -- The unroll heuristics --
4340  // We unroll the loop in order to expose ILP and reduce the loop overhead.
4341  // There are many micro-architectural considerations that we can't predict
4342  // at this level. For example frontend pressure (on decode or fetch) due to
4343  // code size, or the number and capabilities of the execution ports.
4344  //
4345  // We use the following heuristics to select the unroll factor:
4346  // 1. If the code has reductions then we unroll in order to break the cross
4347  // iteration dependency.
4348  // 2. If the loop is really small then we unroll in order to reduce the loop
4349  // overhead.
4350  // 3. We don't unroll if we think that we will spill registers to memory due
4351  // to the increased register pressure.
4352
4353  // Use the user preference, unless 'auto' is selected.
4354  if (UserUF != 0)
4355    return UserUF;
4356
4357  // When we optimize for size we don't unroll.
4358  if (OptForSize)
4359    return 1;
4360
4361  // Do not unroll loops constrained by a max safe dependence distance.
4362  if (Legal->getMaxSafeDepDistBytes() != -1U)
4363    return 1;
4364
4365  // Do not unroll loops with a relatively small trip count.
4366  unsigned TC = SE->getSmallConstantTripCount(TheLoop,
4367                                              TheLoop->getLoopLatch());
4368  if (TC > 1 && TC < TinyTripCountUnrollThreshold)
4369    return 1;
4370
4371  unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
4372  DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
4373        " vector registers\n");
4374
4375  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
4376  // These values are used as divisors below, so clamp them to at least one:
4377  // assume the loop has an instruction that uses at least one register.
4378  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4379  R.NumInstructions = std::max(R.NumInstructions, 1U);
4380
4381  // We calculate the unroll factor using the following formula.
4382  // Subtract the number of loop invariants from the number of available
4383  // registers. These registers are used by all of the unrolled instances.
4384  // Next, divide the remaining registers by the number of registers that is
4385  // required by the loop, in order to estimate how many parallel instances
4386  // fit without causing spills.
4387  unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
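  // For example (hypothetical numbers): with 16 vector registers, 2
  // loop-invariant values, and at most 7 simultaneously live in-loop values,
  // the formula yields UF = (16 - 2) / 7 = 2 parallel instances.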
4388
4389  // Clamp the unroll factor ranges to reasonable factors.
4390  unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
4391
4392  // If we did not calculate the cost for VF (because the user selected the VF)
4393  // then we calculate the cost of VF here.
4394  if (LoopCost == 0)
4395    LoopCost = expectedCost(VF);
4396
4397  // Clamp the calculated UF to be between 1 and the maximum unroll factor
4398  // that the target allows.
4399  if (UF > MaxUnrollSize)
4400    UF = MaxUnrollSize;
4401  else if (UF < 1)
4402    UF = 1;
4403
4404  bool HasReductions = Legal->getReductionVars()->size();
4405
4406  // Decide if we want to unroll if we decided that it is legal to vectorize
4407  // but not profitable.
4408  if (VF == 1) {
4409    if (TheLoop->getNumBlocks() > 1 || !HasReductions ||
4410        LoopCost > SmallLoopCost)
4411      return 1;
4412
4413    return UF;
4414  }
4415
4416  if (HasReductions) {
4417    DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
4418    return UF;
4419  }
4420
4421  // We want to unroll tiny loops in order to reduce the loop overhead.
4422  // We assume that the cost overhead is 1 and we use the cost model
4423  // to estimate the cost of the loop and unroll until the cost of the
4424  // loop overhead is about 5% of the cost of the loop.
4425  DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
4426  if (LoopCost < SmallLoopCost) {
4427    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
4428    unsigned NewUF = SmallLoopCost / (LoopCost + 1);
4429    return std::min(NewUF, UF);
4430  }
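  // For example, assuming SmallLoopCost is 20, a loop with cost 4 gets
  // NewUF = 20 / (4 + 1) = 4, further clamped by the register-based UF above.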
4431
4432  DEBUG(dbgs() << "LV: Not Unrolling. \n");
4433  return 1;
4434}
4435
4436LoopVectorizationCostModel::RegisterUsage
4437LoopVectorizationCostModel::calculateRegisterUsage() {
4438  // This function calculates the register usage by measuring the highest number
4439  // of values that are alive at a single location. Obviously, this is a very
4440  // rough estimation. We scan the loop in topological order and
4441  // assign a number to each instruction. We use RPO to ensure that defs are
4442  // met before their users. We assume that each instruction that has in-loop
4443  // users starts an interval. We record every time that an in-loop value is
4444  // used, so we have a list of the first and last occurrences of each
4445  // instruction. Next, we transpose this data structure into a multi map that
4446  // holds the list of intervals that *end* at a specific location. This multi
4447  // map allows us to perform a linear search. We scan the instructions linearly
4448  // and record each time that a new interval starts, by placing it in a set.
4449  // If we find this value in the multi-map then we remove it from the set.
4450  // The max register usage is the maximum size of the set.
4451  // We also search for instructions that are defined outside the loop, but are
4452  // used inside the loop. We need this number separately from the max-interval
4453  // usage number because when we unroll, loop-invariant values do not take
4454  // more registers.
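  //
  // For example, in the hypothetical straight-line sequence
  //   %a = ...
  //   %b = add i32 %a, 1
  //   %c = add i32 %a, 2
  // the interval of %a stays open from its definition until %c, its last use,
  // so %a is counted as live at every index in between.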
4455  LoopBlocksDFS DFS(TheLoop);
4456  DFS.perform(LI);
4457
4458  RegisterUsage R;
4459  R.NumInstructions = 0;
4460
4461  // Each 'key' in the map opens a new interval. The values
4462  // of the map are the index of the 'last seen' usage of the
4463  // instruction that is the key.
4464  typedef DenseMap<Instruction*, unsigned> IntervalMap;
4465  // Maps an instruction index to the instruction itself.
4466  DenseMap<unsigned, Instruction*> IdxToInstr;
4467  // Marks the end of each interval.
4468  IntervalMap EndPoint;
4469  // Saves the set of instructions that have in-loop users.
4470  SmallSet<Instruction*, 8> Ends;
4471  // Saves the list of values that are used in the loop but are
4472  // defined outside the loop, such as arguments and constants.
4473  SmallPtrSet<Value*, 8> LoopInvariants;
4474
4475  unsigned Index = 0;
4476  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
4477       be = DFS.endRPO(); bb != be; ++bb) {
4478    R.NumInstructions += (*bb)->size();
4479    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
4480         ++it) {
4481      Instruction *I = it;
4482      IdxToInstr[Index++] = I;
4483
4484      // Save the end location of each USE.
4485      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
4486        Value *U = I->getOperand(i);
4487        Instruction *Instr = dyn_cast<Instruction>(U);
4488
4489        // Ignore non-instruction values such as arguments, constants, etc.
4490        if (!Instr) continue;
4491
4492        // If this instruction is outside the loop then record it and continue.
4493        if (!TheLoop->contains(Instr)) {
4494          LoopInvariants.insert(Instr);
4495          continue;
4496        }
4497
4498        // Overwrite previous end points.
4499        EndPoint[Instr] = Index;
4500        Ends.insert(Instr);
4501      }
4502    }
4503  }
4504
4505  // Saves the list of intervals that end with the index in 'key'.
4506  typedef SmallVector<Instruction*, 2> InstrList;
4507  DenseMap<unsigned, InstrList> TransposeEnds;
4508
4509  // Transpose the EndPoints to a list of values that end at each index.
4510  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
4511       it != e; ++it)
4512    TransposeEnds[it->second].push_back(it->first);
4513
4514  SmallSet<Instruction*, 8> OpenIntervals;
4515  unsigned MaxUsage = 0;
4516
4517
4518  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
4519  for (unsigned int i = 0; i < Index; ++i) {
4520    Instruction *I = IdxToInstr[i];
4521    // Ignore instructions that are never used within the loop.
4522    if (!Ends.count(I)) continue;
4523
4524    // Remove all of the instructions that end at this location.
4525    InstrList &List = TransposeEnds[i];
4526    for (unsigned int j=0, e = List.size(); j < e; ++j)
4527      OpenIntervals.erase(List[j]);
4528
4529    // Count the number of live intervals.
4530    MaxUsage = std::max(MaxUsage, OpenIntervals.size());
4531
4532    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
4533          OpenIntervals.size() << "\n");
4534
4535    // Add the current instruction to the list of open intervals.
4536    OpenIntervals.insert(I);
4537  }
4538
4539  unsigned Invariant = LoopInvariants.size();
4540  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
4541  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
4542  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
4543
4544  R.LoopInvariantRegs = Invariant;
4545  R.MaxLocalUsers = MaxUsage;
4546  return R;
4547}
4548
4549unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
4550  unsigned Cost = 0;
4551
4552  // For each block.
4553  for (Loop::block_iterator bb = TheLoop->block_begin(),
4554       be = TheLoop->block_end(); bb != be; ++bb) {
4555    unsigned BlockCost = 0;
4556    BasicBlock *BB = *bb;
4557
4558    // For each instruction in the old loop.
4559    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
4560      // Skip dbg intrinsics.
4561      if (isa<DbgInfoIntrinsic>(it))
4562        continue;
4563
4564      unsigned C = getInstructionCost(it, VF);
4565      BlockCost += C;
4566      DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
4567            VF << " For instruction: "<< *it << "\n");
4568    }
4569
4570    // We assume that if-converted blocks have a 50% chance of being executed.
4571    // When the code is scalar, some blocks are skipped due to control flow.
4572    // When the code is vectorized we execute all code paths.
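    // (For example, a predicated block whose scalar cost is 12 contributes
    // only 6 to the VF == 1 estimate.)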
4573    if (VF == 1 && Legal->blockNeedsPredication(*bb))
4574      BlockCost /= 2;
4575
4576    Cost += BlockCost;
4577  }
4578
4579  return Cost;
4580}
4581
4582/// \brief Check whether the address computation for a non-consecutive memory
4583/// access looks like an unlikely candidate for being merged into the indexing
4584/// mode.
4585///
4586/// We look for a GEP which has one index that is an induction variable and all
4587/// other indices are loop invariant. If the stride of this access is also
4588/// within a small bound we decide that this address computation can likely be
4589/// merged into the addressing mode.
4590/// In all other cases, we identify the address computation as complex.
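///
/// For instance (hypothetical IR), an access through
///   %gep = getelementptr inbounds i32* %base, i64 %iv
/// with a small constant stride can usually be folded into the addressing
/// mode, whereas a pointer whose step is non-constant or very large is
/// treated as a separate, complex computation.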
4591static bool isLikelyComplexAddressComputation(Value *Ptr,
4592                                              LoopVectorizationLegality *Legal,
4593                                              ScalarEvolution *SE,
4594                                              const Loop *TheLoop) {
4595  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
4596  if (!Gep)
4597    return true;
4598
4599  // We are looking for a gep with all loop invariant indices except for one
4600  // which should be an induction variable.
4601  unsigned NumOperands = Gep->getNumOperands();
4602  for (unsigned i = 1; i < NumOperands; ++i) {
4603    Value *Opd = Gep->getOperand(i);
4604    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
4605        !Legal->isInductionVariable(Opd))
4606      return true;
4607  }
4608
4609  // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
4610  // can likely be merged into the address computation.
4611  unsigned MaxMergeDistance = 64;
4612
4613  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
4614  if (!AddRec)
4615    return true;
4616
4617  // Check that the step is constant.
4618  const SCEV *Step = AddRec->getStepRecurrence(*SE);
4619  // A non-constant step cannot be folded into the addressing mode.
4620  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
4621  if (!C)
4622    return true;
4623
4624  const APInt &APStepVal = C->getValue()->getValue();
4625
4626  // Huge step value - give up.
4627  if (APStepVal.getBitWidth() > 64)
4628    return true;
4629
4630  int64_t StepVal = APStepVal.getSExtValue();
4631
4632  return StepVal > MaxMergeDistance;
4633}
4634
4635unsigned
4636LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
4637  // If we know that this instruction will remain uniform, check the cost of
4638  // the scalar version.
4639  if (Legal->isUniformAfterVectorization(I))
4640    VF = 1;
4641
4642  Type *RetTy = I->getType();
4643  Type *VectorTy = ToVectorTy(RetTy, VF);
4644
4645  // TODO: We need to estimate the cost of intrinsic calls.
4646  switch (I->getOpcode()) {
4647  case Instruction::GetElementPtr:
4648    // We mark this instruction as zero-cost because the cost of GEPs in
4649    // vectorized code depends on whether the corresponding memory instruction
4650    // is scalarized or not. Therefore, we handle GEPs with the memory
4651    // instruction cost.
4652    return 0;
4653  case Instruction::Br: {
4654    return TTI.getCFInstrCost(I->getOpcode());
4655  }
4656  case Instruction::PHI:
4657    // TODO: IF-converted IFs become selects.
4658    return 0;
4659  case Instruction::Add:
4660  case Instruction::FAdd:
4661  case Instruction::Sub:
4662  case Instruction::FSub:
4663  case Instruction::Mul:
4664  case Instruction::FMul:
4665  case Instruction::UDiv:
4666  case Instruction::SDiv:
4667  case Instruction::FDiv:
4668  case Instruction::URem:
4669  case Instruction::SRem:
4670  case Instruction::FRem:
4671  case Instruction::Shl:
4672  case Instruction::LShr:
4673  case Instruction::AShr:
4674  case Instruction::And:
4675  case Instruction::Or:
4676  case Instruction::Xor: {
4677    // Certain instructions can be cheaper to vectorize if they have a constant
4678    // second vector operand. One example of this is shifts on x86.
4679    TargetTransformInfo::OperandValueKind Op1VK =
4680      TargetTransformInfo::OK_AnyValue;
4681    TargetTransformInfo::OperandValueKind Op2VK =
4682      TargetTransformInfo::OK_AnyValue;
4683
4684    if (isa<ConstantInt>(I->getOperand(1)))
4685      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
4686
4687    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
4688  }
4689  case Instruction::Select: {
4690    SelectInst *SI = cast<SelectInst>(I);
4691    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
4692    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
4693    Type *CondTy = SI->getCondition()->getType();
4694    if (!ScalarCond)
4695      CondTy = VectorType::get(CondTy, VF);
4696
4697    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
4698  }
4699  case Instruction::ICmp:
4700  case Instruction::FCmp: {
4701    Type *ValTy = I->getOperand(0)->getType();
4702    VectorTy = ToVectorTy(ValTy, VF);
4703    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
4704  }
4705  case Instruction::Store:
4706  case Instruction::Load: {
4707    StoreInst *SI = dyn_cast<StoreInst>(I);
4708    LoadInst *LI = dyn_cast<LoadInst>(I);
4709    Type *ValTy = (SI ? SI->getValueOperand()->getType() :
4710                   LI->getType());
4711    VectorTy = ToVectorTy(ValTy, VF);
4712
4713    unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
4714    unsigned AS = SI ? SI->getPointerAddressSpace() :
4715      LI->getPointerAddressSpace();
4716    Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
4717    // We add the cost of address computation here instead of with the gep
4718    // instruction because only here do we know whether the operation is
4719    // scalarized.
4720    if (VF == 1)
4721      return TTI.getAddressComputationCost(VectorTy) +
4722        TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
4723
4724    // Determine whether the load/store must be scalarized.
4725    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
4726    bool Reverse = ConsecutiveStride < 0;
4727    unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
4728    unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
4729    if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
4730      bool IsComplexComputation =
4731        isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
4732      unsigned Cost = 0;
4733      // The cost of extracting from the value vector and pointer vector.
4734      Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
4735      for (unsigned i = 0; i < VF; ++i) {
4736        // The cost of extracting the pointer operand.
4737        Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
4738        // In case of STORE, the cost of ExtractElement from the vector.
4739        // In case of LOAD, the cost of InsertElement into the returned
4740        // vector.
4741        Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
4742                                            Instruction::InsertElement,
4743                                            VectorTy, i);
4744      }
4745
4746      // The cost of the scalar loads/stores.
4747      Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
4748      Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
4749                                       Alignment, AS);
4750      return Cost;
4751    }
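    // (With a hypothetical VF of 4, the scalarized path above pays for four
    // pointer extracts, four value inserts or extracts, four address
    // computations, and four scalar memory operations.)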
4752
4753    // Wide loads/stores.
4754    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
4755    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
4756
4757    if (Reverse)
4758      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4759                                  VectorTy, 0);
4760    return Cost;
4761  }
4762  case Instruction::ZExt:
4763  case Instruction::SExt:
4764  case Instruction::FPToUI:
4765  case Instruction::FPToSI:
4766  case Instruction::FPExt:
4767  case Instruction::PtrToInt:
4768  case Instruction::IntToPtr:
4769  case Instruction::SIToFP:
4770  case Instruction::UIToFP:
4771  case Instruction::Trunc:
4772  case Instruction::FPTrunc:
4773  case Instruction::BitCast: {
4774    // We optimize the truncation of induction variables: the cost of such a
4775    // truncation is the same as the scalar operation.
4776    if (I->getOpcode() == Instruction::Trunc &&
4777        Legal->isInductionVariable(I->getOperand(0)))
4778      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
4779                                  I->getOperand(0)->getType());
4780
4781    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
4782    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
4783  }
4784  case Instruction::Call: {
4785    CallInst *CI = cast<CallInst>(I);
4786    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
4787    assert(ID && "Not an intrinsic call!");
4788    Type *RetTy = ToVectorTy(CI->getType(), VF);
4789    SmallVector<Type*, 4> Tys;
4790    for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
4791      Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
4792    return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
4793  }
4794  default: {
4795    // We are scalarizing the instruction. Return the cost of the scalar
4796    // instruction, plus the cost of inserting the results into and
4797    // extracting the operands from vectors, times the vector width.
4798    unsigned Cost = 0;
4799
4800    if (!RetTy->isVoidTy() && VF != 1) {
4801      unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
4802                                                VectorTy);
4803      unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
4804                                                VectorTy);
4805
4806      // The cost of inserting the results plus extracting each one of the
4807      // operands.
4808      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
4809    }
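    // (For a hypothetical binary operator at VF = 4 this adds
    // 4 * (InsCost + 2 * ExtCost).)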
4810
4811    // The cost of executing VF copies of the scalar instruction. This opcode
4812    // is unknown. Assume that it is the same as 'mul'.
4813    Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
4814    return Cost;
4815  }
4816  }// end of switch.
4817}
4818
4819Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
4820  if (Scalar->isVoidTy() || VF == 1)
4821    return Scalar;
4822  return VectorType::get(Scalar, VF);
4823}
4824
4825char LoopVectorize::ID = 0;
4826static const char lv_name[] = "Loop Vectorization";
4827INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
4828INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
4829INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
4830INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
4831INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
4832
4833namespace llvm {
4834  Pass *createLoopVectorizePass(bool NoUnrolling) {
4835    return new LoopVectorize(NoUnrolling);
4836  }
4837}
4838
4839bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
4840  // Check for a store.
4841  if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
4842    return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
4843
4844  // Check for a load.
4845  if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
4846    return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
4847
4848  return false;
4849}
4850
4851
4852void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
4853  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
4854  // Holds vector parameters or scalars, in case of uniform vals.
4855  SmallVector<VectorParts, 4> Params;
4856
4857  setDebugLocFromInst(Builder, Instr);
4858
4859  // Find all of the vectorized parameters.
4860  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
4861    Value *SrcOp = Instr->getOperand(op);
4862
4863    // If we are accessing the old induction variable, use the new one.
4864    if (SrcOp == OldInduction) {
4865      Params.push_back(getVectorValue(SrcOp));
4866      continue;
4867    }
4868
4869    // Try using previously calculated values.
4870    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
4871
4872    // If the src is an instruction defined inside the loop then it should
4873    // already have been widened.
4874    if (SrcInst && OrigLoop->contains(SrcInst)) {
4875      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
4876      // The parameter is a vector value from earlier.
4877      Params.push_back(WidenMap.get(SrcInst));
4878    } else {
4879      // The parameter is a scalar from outside the loop. Maybe even a constant.
4880      VectorParts Scalars;
4881      Scalars.append(UF, SrcOp);
4882      Params.push_back(Scalars);
4883    }
4884  }
4885
4886  assert(Params.size() == Instr->getNumOperands() &&
4887         "Invalid number of operands");
4888
4889  // Does this instruction return a value?
4890  bool IsVoidRetTy = Instr->getType()->isVoidTy();
4891
4892  Value *UndefVec = IsVoidRetTy ? 0 :
4893                    UndefValue::get(Instr->getType());
4894  // Create a new entry in the WidenMap and initialize it to Undef or Null.
4895  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
4896
4897  // For each vector unroll 'part':
4898  for (unsigned Part = 0; Part < UF; ++Part) {
4899    // Create the scalar copy for this unrolled iteration:
4900
4901    Instruction *Cloned = Instr->clone();
4902    if (!IsVoidRetTy)
4903      Cloned->setName(Instr->getName() + ".cloned");
4904    // Replace the operands of the cloned instruction with the scalar parts.
4905    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
4906      Value *Op = Params[op][Part];
4907      Cloned->setOperand(op, Op);
4908    }
4909
4910    // Place the cloned scalar in the new loop.
4911    Builder.Insert(Cloned);
4912
4913    // If the original scalar returns a value we need to place it in a vector
4914    // so that future users will be able to use it.
4915    if (!IsVoidRetTy)
4916      VecResults[Part] = Cloned;
4917  }
4918}
4919
4920void
4921InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr,
4922                                              LoopVectorizationLegality*) {
4923  return scalarizeInstruction(Instr);
4924}
4925
4926Value *InnerLoopUnroller::reverseVector(Value *Vec) {
4927  return Vec;
4928}
4929
4930Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
4931  return V;
4932}
4933
4934Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
4935                                               bool Negate) {
4936  // When unrolling and the VF is 1, we only need to add a simple scalar.
4937  Type *ITy = Val->getType();
4938  assert(!ITy->isVectorTy() && "Val must be a scalar");
4939  Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
4940  return Builder.CreateAdd(Val, C, "induction");
4941}
4942
4943