LoopVectorize.cpp revision 96a6555b5706af59e408bb190c3685f0f15bc2a9
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 11// and generates target-independent LLVM-IR. Legalization of the IR is done 12// in the codegen. However, the vectorizes uses (will use) the codegen 13// interfaces to generate IR that is likely to result in an optimal binary. 14// 15// The loop vectorizer combines consecutive loop iteration into a single 16// 'wide' iteration. After this transformation the index is incremented 17// by the SIMD vector width, and not by one. 18// 19// This pass has three parts: 20// 1. The main loop pass that drives the different parts. 21// 2. LoopVectorizationLegality - A unit that checks for the legality 22// of the vectorization. 23// 3. InnerLoopVectorizer - A unit that performs the actual 24// widening of instructions. 25// 4. LoopVectorizationCostModel - A unit that checks for the profitability 26// of vectorization. It decides on the optimal vector width, which 27// can be one, if vectorization is not profitable. 28// 29//===----------------------------------------------------------------------===// 30// 31// The reduction-variable vectorization is based on the paper: 32// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 33// 34// Variable uniformity checks are inspired by: 35// Karrenberg, R. and Hack, S. Whole Function Vectorization. 36// 37// Other ideas/concepts are from: 38// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 39// 40// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 41// Vectorizing Compilers. 42// 43//===----------------------------------------------------------------------===// 44 45#define LV_NAME "loop-vectorize" 46#define DEBUG_TYPE LV_NAME 47 48#include "llvm/Transforms/Vectorize.h" 49#include "llvm/ADT/DenseMap.h" 50#include "llvm/ADT/MapVector.h" 51#include "llvm/ADT/SmallPtrSet.h" 52#include "llvm/ADT/SmallSet.h" 53#include "llvm/ADT/SmallVector.h" 54#include "llvm/ADT/StringExtras.h" 55#include "llvm/Analysis/AliasAnalysis.h" 56#include "llvm/Analysis/AliasSetTracker.h" 57#include "llvm/Analysis/Dominators.h" 58#include "llvm/Analysis/LoopInfo.h" 59#include "llvm/Analysis/LoopIterator.h" 60#include "llvm/Analysis/LoopPass.h" 61#include "llvm/Analysis/ScalarEvolution.h" 62#include "llvm/Analysis/ScalarEvolutionExpander.h" 63#include "llvm/Analysis/ScalarEvolutionExpressions.h" 64#include "llvm/Analysis/TargetTransformInfo.h" 65#include "llvm/Analysis/ValueTracking.h" 66#include "llvm/Analysis/Verifier.h" 67#include "llvm/IR/Constants.h" 68#include "llvm/IR/DataLayout.h" 69#include "llvm/IR/DerivedTypes.h" 70#include "llvm/IR/Function.h" 71#include "llvm/IR/IRBuilder.h" 72#include "llvm/IR/Instructions.h" 73#include "llvm/IR/IntrinsicInst.h" 74#include "llvm/IR/LLVMContext.h" 75#include "llvm/IR/Module.h" 76#include "llvm/IR/Type.h" 77#include "llvm/IR/Value.h" 78#include "llvm/Pass.h" 79#include "llvm/Support/CommandLine.h" 80#include "llvm/Support/Debug.h" 81#include "llvm/Support/raw_ostream.h" 82#include "llvm/Transforms/Scalar.h" 83#include "llvm/Transforms/Utils/BasicBlockUtils.h" 84#include "llvm/Transforms/Utils/Local.h" 85#include <algorithm> 86#include <map> 87 88using namespace llvm; 89 90static cl::opt<unsigned> 91VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, 92 cl::desc("Sets the SIMD width. Zero is autoselect.")); 93 94static cl::opt<unsigned> 95VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden, 96 cl::desc("Sets the vectorization unroll count. " 97 "Zero is autoselect.")); 98 99static cl::opt<bool> 100EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, 101 cl::desc("Enable if-conversion during vectorization.")); 102 103/// We don't vectorize loops with a known constant trip count below this number. 104static cl::opt<unsigned> 105TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, 106 cl::desc("The minimum trip count in the loops to vectorize.")); 107 108/// We don't unroll loops with a known constant trip count below this number. 109static const unsigned TinyTripCountUnrollThreshold = 128; 110 111/// When performing a runtime memory check, do not check more than this 112/// number of pointers. Notice that the check is quadratic! 113static const unsigned RuntimeMemoryCheckThreshold = 4; 114 115namespace { 116 117// Forward declarations. 118class LoopVectorizationLegality; 119class LoopVectorizationCostModel; 120 121/// InnerLoopVectorizer vectorizes loops which contain only one basic 122/// block to a specified vectorization factor (VF). 123/// This class performs the widening of scalars into vectors, or multiple 124/// scalars. This class also implements the following features: 125/// * It inserts an epilogue loop for handling loops that don't have iteration 126/// counts that are known to be a multiple of the vectorization factor. 127/// * It handles the code generation for reduction variables. 128/// * Scalarization (implementation using scalars) of un-vectorizable 129/// instructions. 130/// InnerLoopVectorizer does not perform any vectorization-legality 131/// checks, and relies on the caller to check for the different legality 132/// aspects. The InnerLoopVectorizer relies on the 133/// LoopVectorizationLegality class to provide information about the induction 134/// and reduction variables that were found to a given vectorization factor. 135class InnerLoopVectorizer { 136public: 137 InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, 138 DominatorTree *DT, DataLayout *DL, unsigned VecWidth, 139 unsigned UnrollFactor) 140 : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth), 141 UF(UnrollFactor), Builder(SE->getContext()), Induction(0), 142 OldInduction(0), WidenMap(UnrollFactor) {} 143 144 // Perform the actual loop widening (vectorization). 145 void vectorize(LoopVectorizationLegality *Legal) { 146 // Create a new empty loop. Unlink the old loop and connect the new one. 147 createEmptyLoop(Legal); 148 // Widen each instruction in the old loop to a new one in the new loop. 149 // Use the Legality module to find the induction and reduction variables. 150 vectorizeLoop(Legal); 151 // Register the new loop and update the analysis passes. 152 updateAnalysis(); 153 } 154 155private: 156 /// A small list of PHINodes. 157 typedef SmallVector<PHINode*, 4> PhiVector; 158 /// When we unroll loops we have multiple vector values for each scalar. 159 /// This data structure holds the unrolled and vectorized values that 160 /// originated from one scalar instruction. 161 typedef SmallVector<Value*, 2> VectorParts; 162 163 /// Add code that checks at runtime if the accessed arrays overlap. 164 /// Returns the comparator value or NULL if no check is needed. 165 Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal, 166 Instruction *Loc); 167 /// Create an empty loop, based on the loop ranges of the old loop. 168 void createEmptyLoop(LoopVectorizationLegality *Legal); 169 /// Copy and widen the instructions from the old loop. 170 void vectorizeLoop(LoopVectorizationLegality *Legal); 171 172 /// A helper function that computes the predicate of the block BB, assuming 173 /// that the header block of the loop is set to True. It returns the *entry* 174 /// mask for the block BB. 175 VectorParts createBlockInMask(BasicBlock *BB); 176 /// A helper function that computes the predicate of the edge between SRC 177 /// and DST. 178 VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); 179 180 /// A helper function to vectorize a single BB within the innermost loop. 181 void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, 182 PhiVector *PV); 183 184 /// Insert the new loop to the loop hierarchy and pass manager 185 /// and update the analysis passes. 186 void updateAnalysis(); 187 188 /// This instruction is un-vectorizable. Implement it as a sequence 189 /// of scalars. 190 void scalarizeInstruction(Instruction *Instr); 191 192 /// Vectorize Load and Store instructions, 193 void vectorizeMemoryInstruction(Instruction *Instr, 194 LoopVectorizationLegality *Legal); 195 196 /// Create a broadcast instruction. This method generates a broadcast 197 /// instruction (shuffle) for loop invariant values and for the induction 198 /// value. If this is the induction variable then we extend it to N, N+1, ... 199 /// this is needed because each iteration in the loop corresponds to a SIMD 200 /// element. 201 Value *getBroadcastInstrs(Value *V); 202 203 /// This function adds 0, 1, 2 ... to each vector element, starting at zero. 204 /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). 205 /// The sequence starts at StartIndex. 206 Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate); 207 208 /// When we go over instructions in the basic block we rely on previous 209 /// values within the current basic block or on loop invariant values. 210 /// When we widen (vectorize) values we place them in the map. If the values 211 /// are not within the map, they have to be loop invariant, so we simply 212 /// broadcast them into a vector. 213 VectorParts &getVectorValue(Value *V); 214 215 /// Generate a shuffle sequence that will reverse the vector Vec. 216 Value *reverseVector(Value *Vec); 217 218 /// This is a helper class that holds the vectorizer state. It maps scalar 219 /// instructions to vector instructions. When the code is 'unrolled' then 220 /// then a single scalar value is mapped to multiple vector parts. The parts 221 /// are stored in the VectorPart type. 222 struct ValueMap { 223 /// C'tor. UnrollFactor controls the number of vectors ('parts') that 224 /// are mapped. 225 ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} 226 227 /// \return True if 'Key' is saved in the Value Map. 228 bool has(Value *Key) const { return MapStorage.count(Key); } 229 230 /// Initializes a new entry in the map. Sets all of the vector parts to the 231 /// save value in 'Val'. 232 /// \return A reference to a vector with splat values. 233 VectorParts &splat(Value *Key, Value *Val) { 234 VectorParts &Entry = MapStorage[Key]; 235 Entry.assign(UF, Val); 236 return Entry; 237 } 238 239 ///\return A reference to the value that is stored at 'Key'. 240 VectorParts &get(Value *Key) { 241 VectorParts &Entry = MapStorage[Key]; 242 if (Entry.empty()) 243 Entry.resize(UF); 244 assert(Entry.size() == UF); 245 return Entry; 246 } 247 248 private: 249 /// The unroll factor. Each entry in the map stores this number of vector 250 /// elements. 251 unsigned UF; 252 253 /// Map storage. We use std::map and not DenseMap because insertions to a 254 /// dense map invalidates its iterators. 255 std::map<Value *, VectorParts> MapStorage; 256 }; 257 258 /// The original loop. 259 Loop *OrigLoop; 260 /// Scev analysis to use. 261 ScalarEvolution *SE; 262 /// Loop Info. 263 LoopInfo *LI; 264 /// Dominator Tree. 265 DominatorTree *DT; 266 /// Data Layout. 267 DataLayout *DL; 268 /// The vectorization SIMD factor to use. Each vector will have this many 269 /// vector elements. 270 unsigned VF; 271 /// The vectorization unroll factor to use. Each scalar is vectorized to this 272 /// many different vector instructions. 273 unsigned UF; 274 275 /// The builder that we use 276 IRBuilder<> Builder; 277 278 // --- Vectorization state --- 279 280 /// The vector-loop preheader. 281 BasicBlock *LoopVectorPreHeader; 282 /// The scalar-loop preheader. 283 BasicBlock *LoopScalarPreHeader; 284 /// Middle Block between the vector and the scalar. 285 BasicBlock *LoopMiddleBlock; 286 ///The ExitBlock of the scalar loop. 287 BasicBlock *LoopExitBlock; 288 ///The vector loop body. 289 BasicBlock *LoopVectorBody; 290 ///The scalar loop body. 291 BasicBlock *LoopScalarBody; 292 /// A list of all bypass blocks. The first block is the entry of the loop. 293 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 294 295 /// The new Induction variable which was added to the new block. 296 PHINode *Induction; 297 /// The induction variable of the old basic block. 298 PHINode *OldInduction; 299 /// Maps scalars to widened vectors. 300 ValueMap WidenMap; 301}; 302 303/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and 304/// to what vectorization factor. 305/// This class does not look at the profitability of vectorization, only the 306/// legality. This class has two main kinds of checks: 307/// * Memory checks - The code in canVectorizeMemory checks if vectorization 308/// will change the order of memory accesses in a way that will change the 309/// correctness of the program. 310/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory 311/// checks for a number of different conditions, such as the availability of a 312/// single induction variable, that all types are supported and vectorize-able, 313/// etc. This code reflects the capabilities of InnerLoopVectorizer. 314/// This class is also used by InnerLoopVectorizer for identifying 315/// induction variable and the different reduction variables. 316class LoopVectorizationLegality { 317public: 318 LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, 319 DominatorTree *DT) 320 : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {} 321 322 /// This enum represents the kinds of reductions that we support. 323 enum ReductionKind { 324 RK_NoReduction, ///< Not a reduction. 325 RK_IntegerAdd, ///< Sum of integers. 326 RK_IntegerMult, ///< Product of integers. 327 RK_IntegerOr, ///< Bitwise or logical OR of numbers. 328 RK_IntegerAnd, ///< Bitwise or logical AND of numbers. 329 RK_IntegerXor, ///< Bitwise or logical XOR of numbers. 330 RK_FloatAdd, ///< Sum of floats. 331 RK_FloatMult ///< Product of floats. 332 }; 333 334 /// This enum represents the kinds of inductions that we support. 335 enum InductionKind { 336 IK_NoInduction, ///< Not an induction variable. 337 IK_IntInduction, ///< Integer induction variable. Step = 1. 338 IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. 339 IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). 340 IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). 341 }; 342 343 /// This POD struct holds information about reduction variables. 344 struct ReductionDescriptor { 345 ReductionDescriptor() : StartValue(0), LoopExitInstr(0), 346 Kind(RK_NoReduction) {} 347 348 ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K) 349 : StartValue(Start), LoopExitInstr(Exit), Kind(K) {} 350 351 // The starting value of the reduction. 352 // It does not have to be zero! 353 Value *StartValue; 354 // The instruction who's value is used outside the loop. 355 Instruction *LoopExitInstr; 356 // The kind of the reduction. 357 ReductionKind Kind; 358 }; 359 360 // This POD struct holds information about the memory runtime legality 361 // check that a group of pointers do not overlap. 362 struct RuntimePointerCheck { 363 RuntimePointerCheck() : Need(false) {} 364 365 /// Reset the state of the pointer runtime information. 366 void reset() { 367 Need = false; 368 Pointers.clear(); 369 Starts.clear(); 370 Ends.clear(); 371 } 372 373 /// Insert a pointer and calculate the start and end SCEVs. 374 void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr); 375 376 /// This flag indicates if we need to add the runtime check. 377 bool Need; 378 /// Holds the pointers that we need to check. 379 SmallVector<Value*, 2> Pointers; 380 /// Holds the pointer value at the beginning of the loop. 381 SmallVector<const SCEV*, 2> Starts; 382 /// Holds the pointer value at the end of the loop. 383 SmallVector<const SCEV*, 2> Ends; 384 }; 385 386 /// A POD for saving information about induction variables. 387 struct InductionInfo { 388 InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} 389 InductionInfo() : StartValue(0), IK(IK_NoInduction) {} 390 /// Start value. 391 Value *StartValue; 392 /// Induction kind. 393 InductionKind IK; 394 }; 395 396 /// ReductionList contains the reduction descriptors for all 397 /// of the reductions that were found in the loop. 398 typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; 399 400 /// InductionList saves induction variables and maps them to the 401 /// induction descriptor. 402 typedef MapVector<PHINode*, InductionInfo> InductionList; 403 404 /// Returns true if it is legal to vectorize this loop. 405 /// This does not mean that it is profitable to vectorize this 406 /// loop, only that it is legal to do so. 407 bool canVectorize(); 408 409 /// Returns the Induction variable. 410 PHINode *getInduction() { return Induction; } 411 412 /// Returns the reduction variables found in the loop. 413 ReductionList *getReductionVars() { return &Reductions; } 414 415 /// Returns the induction variables found in the loop. 416 InductionList *getInductionVars() { return &Inductions; } 417 418 /// Returns True if V is an induction variable in this loop. 419 bool isInductionVariable(const Value *V); 420 421 /// Return true if the block BB needs to be predicated in order for the loop 422 /// to be vectorized. 423 bool blockNeedsPredication(BasicBlock *BB); 424 425 /// Check if this pointer is consecutive when vectorizing. This happens 426 /// when the last index of the GEP is the induction variable, or that the 427 /// pointer itself is an induction variable. 428 /// This check allows us to vectorize A[idx] into a wide load/store. 429 /// Returns: 430 /// 0 - Stride is unknown or non consecutive. 431 /// 1 - Address is consecutive. 432 /// -1 - Address is consecutive, and decreasing. 433 int isConsecutivePtr(Value *Ptr); 434 435 /// Returns true if the value V is uniform within the loop. 436 bool isUniform(Value *V); 437 438 /// Returns true if this instruction will remain scalar after vectorization. 439 bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } 440 441 /// Returns the information that we collected about runtime memory check. 442 RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } 443private: 444 /// Check if a single basic block loop is vectorizable. 445 /// At this point we know that this is a loop with a constant trip count 446 /// and we only need to check individual instructions. 447 bool canVectorizeInstrs(); 448 449 /// When we vectorize loops we may change the order in which 450 /// we read and write from memory. This method checks if it is 451 /// legal to vectorize the code, considering only memory constrains. 452 /// Returns true if the loop is vectorizable 453 bool canVectorizeMemory(); 454 455 /// Return true if we can vectorize this loop using the IF-conversion 456 /// transformation. 457 bool canVectorizeWithIfConvert(); 458 459 /// Collect the variables that need to stay uniform after vectorization. 460 void collectLoopUniforms(); 461 462 /// Return true if all of the instructions in the block can be speculatively 463 /// executed. 464 bool blockCanBePredicated(BasicBlock *BB); 465 466 /// Returns True, if 'Phi' is the kind of reduction variable for type 467 /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. 468 bool AddReductionVar(PHINode *Phi, ReductionKind Kind); 469 /// Returns true if the instruction I can be a reduction variable of type 470 /// 'Kind'. 471 bool isReductionInstr(Instruction *I, ReductionKind Kind); 472 /// Returns the induction kind of Phi. This function may return NoInduction 473 /// if the PHI is not an induction variable. 474 InductionKind isInductionVariable(PHINode *Phi); 475 /// Return true if can compute the address bounds of Ptr within the loop. 476 bool hasComputableBounds(Value *Ptr); 477 478 /// The loop that we evaluate. 479 Loop *TheLoop; 480 /// Scev analysis. 481 ScalarEvolution *SE; 482 /// DataLayout analysis. 483 DataLayout *DL; 484 // Dominators. 485 DominatorTree *DT; 486 487 // --- vectorization state --- // 488 489 /// Holds the integer induction variable. This is the counter of the 490 /// loop. 491 PHINode *Induction; 492 /// Holds the reduction variables. 493 ReductionList Reductions; 494 /// Holds all of the induction variables that we found in the loop. 495 /// Notice that inductions don't need to start at zero and that induction 496 /// variables can be pointers. 497 InductionList Inductions; 498 499 /// Allowed outside users. This holds the reduction 500 /// vars which can be accessed from outside the loop. 501 SmallPtrSet<Value*, 4> AllowedExit; 502 /// This set holds the variables which are known to be uniform after 503 /// vectorization. 504 SmallPtrSet<Instruction*, 4> Uniforms; 505 /// We need to check that all of the pointers in this list are disjoint 506 /// at runtime. 507 RuntimePointerCheck PtrRtCheck; 508}; 509 510/// LoopVectorizationCostModel - estimates the expected speedups due to 511/// vectorization. 512/// In many cases vectorization is not profitable. This can happen because of 513/// a number of reasons. In this class we mainly attempt to predict the 514/// expected speedup/slowdowns due to the supported instruction set. We use the 515/// TargetTransformInfo to query the different backends for the cost of 516/// different operations. 517class LoopVectorizationCostModel { 518public: 519 LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, 520 LoopVectorizationLegality *Legal, 521 const TargetTransformInfo &TTI, 522 DataLayout *DL) 523 : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL) {} 524 525 /// Information about vectorization costs 526 struct VectorizationFactor { 527 unsigned Width; // Vector width with best cost 528 unsigned Cost; // Cost of the loop with that width 529 }; 530 /// \return The most profitable vectorization factor and the cost of that VF. 531 /// This method checks every power of two up to VF. If UserVF is not ZERO 532 /// then this vectorization factor will be selected if vectorization is 533 /// possible. 534 VectorizationFactor selectVectorizationFactor(bool OptForSize, unsigned UserVF); 535 536 /// \return The size (in bits) of the widest type in the code that 537 /// needs to be vectorized. We ignore values that remain scalar such as 538 /// 64 bit loop indices. 539 unsigned getWidestType(); 540 541 /// \return The most profitable unroll factor. 542 /// If UserUF is non-zero then this method finds the best unroll-factor 543 /// based on register pressure and other parameters. 544 /// VF and LoopCost are the selected vectorization factor and the cost of the 545 /// selected VF. 546 unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF, 547 unsigned LoopCost); 548 549 /// \brief A struct that represents some properties of the register usage 550 /// of a loop. 551 struct RegisterUsage { 552 /// Holds the number of loop invariant values that are used in the loop. 553 unsigned LoopInvariantRegs; 554 /// Holds the maximum number of concurrent live intervals in the loop. 555 unsigned MaxLocalUsers; 556 /// Holds the number of instructions in the loop. 557 unsigned NumInstructions; 558 }; 559 560 /// \return information about the register usage of the loop. 561 RegisterUsage calculateRegisterUsage(); 562 563private: 564 /// Returns the expected execution cost. The unit of the cost does 565 /// not matter because we use the 'cost' units to compare different 566 /// vector widths. The cost that is returned is *not* normalized by 567 /// the factor width. 568 unsigned expectedCost(unsigned VF); 569 570 /// Returns the execution time cost of an instruction for a given vector 571 /// width. Vector width of one means scalar. 572 unsigned getInstructionCost(Instruction *I, unsigned VF); 573 574 /// A helper function for converting Scalar types to vector types. 575 /// If the incoming type is void, we return void. If the VF is 1, we return 576 /// the scalar type. 577 static Type* ToVectorTy(Type *Scalar, unsigned VF); 578 579 /// Returns whether the instruction is a load or store and will be a emitted 580 /// as a vector operation. 581 bool isConsecutiveLoadOrStore(Instruction *I); 582 583 /// The loop that we evaluate. 584 Loop *TheLoop; 585 /// Scev analysis. 586 ScalarEvolution *SE; 587 /// Loop Info analysis. 588 LoopInfo *LI; 589 /// Vectorization legality. 590 LoopVectorizationLegality *Legal; 591 /// Vector target information. 592 const TargetTransformInfo &TTI; 593 /// Target data layout information. 594 DataLayout *DL; 595}; 596 597/// The LoopVectorize Pass. 598struct LoopVectorize : public LoopPass { 599 /// Pass identification, replacement for typeid 600 static char ID; 601 602 explicit LoopVectorize() : LoopPass(ID) { 603 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 604 } 605 606 ScalarEvolution *SE; 607 DataLayout *DL; 608 LoopInfo *LI; 609 TargetTransformInfo *TTI; 610 DominatorTree *DT; 611 612 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { 613 // We only vectorize innermost loops. 614 if (!L->empty()) 615 return false; 616 617 SE = &getAnalysis<ScalarEvolution>(); 618 DL = getAnalysisIfAvailable<DataLayout>(); 619 LI = &getAnalysis<LoopInfo>(); 620 TTI = &getAnalysis<TargetTransformInfo>(); 621 DT = &getAnalysis<DominatorTree>(); 622 623 DEBUG(dbgs() << "LV: Checking a loop in \"" << 624 L->getHeader()->getParent()->getName() << "\"\n"); 625 626 // Check if it is legal to vectorize the loop. 627 LoopVectorizationLegality LVL(L, SE, DL, DT); 628 if (!LVL.canVectorize()) { 629 DEBUG(dbgs() << "LV: Not vectorizing.\n"); 630 return false; 631 } 632 633 // Use the cost model. 634 LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL); 635 636 // Check the function attribues to find out if this function should be 637 // optimized for size. 638 Function *F = L->getHeader()->getParent(); 639 Attribute::AttrKind SzAttr = Attribute::OptimizeForSize; 640 Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat; 641 unsigned FnIndex = AttributeSet::FunctionIndex; 642 bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr); 643 bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr); 644 645 if (NoFloat) { 646 DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" 647 "attribute is used.\n"); 648 return false; 649 } 650 651 // Select the optimal vectorization factor. 652 LoopVectorizationCostModel::VectorizationFactor VF; 653 VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); 654 // Select the unroll factor. 655 unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll, 656 VF.Width, VF.Cost); 657 658 if (VF.Width == 1) { 659 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 660 return false; 661 } 662 663 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< 664 F->getParent()->getModuleIdentifier()<<"\n"); 665 DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); 666 667 // If we decided that it is *legal* to vectorizer the loop then do it. 668 InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF.Width, UF); 669 LB.vectorize(&LVL); 670 671 DEBUG(verifyFunction(*L->getHeader()->getParent())); 672 return true; 673 } 674 675 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 676 LoopPass::getAnalysisUsage(AU); 677 AU.addRequiredID(LoopSimplifyID); 678 AU.addRequiredID(LCSSAID); 679 AU.addRequired<DominatorTree>(); 680 AU.addRequired<LoopInfo>(); 681 AU.addRequired<ScalarEvolution>(); 682 AU.addRequired<TargetTransformInfo>(); 683 AU.addPreserved<LoopInfo>(); 684 AU.addPreserved<DominatorTree>(); 685 } 686 687}; 688 689} // end anonymous namespace 690 691//===----------------------------------------------------------------------===// 692// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 693// LoopVectorizationCostModel. 694//===----------------------------------------------------------------------===// 695 696void 697LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, 698 Loop *Lp, Value *Ptr) { 699 const SCEV *Sc = SE->getSCEV(Ptr); 700 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); 701 assert(AR && "Invalid addrec expression"); 702 const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); 703 const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); 704 Pointers.push_back(Ptr); 705 Starts.push_back(AR->getStart()); 706 Ends.push_back(ScEnd); 707} 708 709Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 710 // Save the current insertion location. 711 Instruction *Loc = Builder.GetInsertPoint(); 712 713 // We need to place the broadcast of invariant variables outside the loop. 714 Instruction *Instr = dyn_cast<Instruction>(V); 715 bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); 716 bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; 717 718 // Place the code for broadcasting invariant variables in the new preheader. 719 if (Invariant) 720 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 721 722 // Broadcast the scalar into all locations in the vector. 723 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 724 725 // Restore the builder insertion point. 726 if (Invariant) 727 Builder.SetInsertPoint(Loc); 728 729 return Shuf; 730} 731 732Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, 733 bool Negate) { 734 assert(Val->getType()->isVectorTy() && "Must be a vector"); 735 assert(Val->getType()->getScalarType()->isIntegerTy() && 736 "Elem must be an integer"); 737 // Create the types. 738 Type *ITy = Val->getType()->getScalarType(); 739 VectorType *Ty = cast<VectorType>(Val->getType()); 740 int VLen = Ty->getNumElements(); 741 SmallVector<Constant*, 8> Indices; 742 743 // Create a vector of consecutive numbers from zero to VF. 744 for (int i = 0; i < VLen; ++i) { 745 int Idx = Negate ? (-i): i; 746 Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx)); 747 } 748 749 // Add the consecutive indices to the vector value. 750 Constant *Cv = ConstantVector::get(Indices); 751 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 752 return Builder.CreateAdd(Val, Cv, "induction"); 753} 754 755int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { 756 assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); 757 // Make sure that the pointer does not point to structs. 758 if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) 759 return 0; 760 761 // If this value is a pointer induction variable we know it is consecutive. 762 PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); 763 if (Phi && Inductions.count(Phi)) { 764 InductionInfo II = Inductions[Phi]; 765 if (IK_PtrInduction == II.IK) 766 return 1; 767 else if (IK_ReversePtrInduction == II.IK) 768 return -1; 769 } 770 771 GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 772 if (!Gep) 773 return 0; 774 775 unsigned NumOperands = Gep->getNumOperands(); 776 Value *LastIndex = Gep->getOperand(NumOperands - 1); 777 778 Value *GpPtr = Gep->getPointerOperand(); 779 // If this GEP value is a consecutive pointer induction variable and all of 780 // the indices are constant then we know it is consecutive. We can 781 Phi = dyn_cast<PHINode>(GpPtr); 782 if (Phi && Inductions.count(Phi)) { 783 784 // Make sure that the pointer does not point to structs. 785 PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); 786 if (GepPtrType->getElementType()->isAggregateType()) 787 return 0; 788 789 // Make sure that all of the index operands are loop invariant. 790 for (unsigned i = 1; i < NumOperands; ++i) 791 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 792 return 0; 793 794 InductionInfo II = Inductions[Phi]; 795 if (IK_PtrInduction == II.IK) 796 return 1; 797 else if (IK_ReversePtrInduction == II.IK) 798 return -1; 799 } 800 801 // Check that all of the gep indices are uniform except for the last. 802 for (unsigned i = 0; i < NumOperands - 1; ++i) 803 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 804 return 0; 805 806 // We can emit wide load/stores only if the last index is the induction 807 // variable. 808 const SCEV *Last = SE->getSCEV(LastIndex); 809 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 810 const SCEV *Step = AR->getStepRecurrence(*SE); 811 812 // The memory is consecutive because the last index is consecutive 813 // and all other indices are loop invariant. 814 if (Step->isOne()) 815 return 1; 816 if (Step->isAllOnesValue()) 817 return -1; 818 } 819 820 return 0; 821} 822 823bool LoopVectorizationLegality::isUniform(Value *V) { 824 return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); 825} 826 827InnerLoopVectorizer::VectorParts& 828InnerLoopVectorizer::getVectorValue(Value *V) { 829 assert(V != Induction && "The new induction variable should not be used."); 830 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 831 832 // If we have this scalar in the map, return it. 833 if (WidenMap.has(V)) 834 return WidenMap.get(V); 835 836 // If this scalar is unknown, assume that it is a constant or that it is 837 // loop invariant. Broadcast V and save the value for future uses. 838 Value *B = getBroadcastInstrs(V); 839 return WidenMap.splat(V, B); 840} 841 842Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 843 assert(Vec->getType()->isVectorTy() && "Invalid type"); 844 SmallVector<Constant*, 8> ShuffleMask; 845 for (unsigned i = 0; i < VF; ++i) 846 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 847 848 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 849 ConstantVector::get(ShuffleMask), 850 "reverse"); 851} 852 853 854void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 855 LoopVectorizationLegality *Legal) { 856 // Attempt to issue a wide load. 857 LoadInst *LI = dyn_cast<LoadInst>(Instr); 858 StoreInst *SI = dyn_cast<StoreInst>(Instr); 859 860 assert((LI || SI) && "Invalid Load/Store instruction"); 861 862 Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 863 Type *DataTy = VectorType::get(ScalarDataTy, VF); 864 Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); 865 unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); 866 867 // If the pointer is loop invariant or if it is non consecutive, 868 // scalarize the load. 869 int Stride = Legal->isConsecutivePtr(Ptr); 870 bool Reverse = Stride < 0; 871 bool UniformLoad = LI && Legal->isUniform(Ptr); 872 if (Stride == 0 || UniformLoad) 873 return scalarizeInstruction(Instr); 874 875 Constant *Zero = Builder.getInt32(0); 876 VectorParts &Entry = WidenMap.get(Instr); 877 878 // Handle consecutive loads/stores. 879 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 880 if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { 881 Value *PtrOperand = Gep->getPointerOperand(); 882 Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; 883 FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); 884 885 // Create the new GEP with the new induction variable. 886 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 887 Gep2->setOperand(0, FirstBasePtr); 888 Gep2->setName("gep.indvar.base"); 889 Ptr = Builder.Insert(Gep2); 890 } else if (Gep) { 891 assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), 892 OrigLoop) && "Base ptr must be invariant"); 893 894 // The last index does not have to be the induction. It can be 895 // consecutive and be a function of the index. For example A[I+1]; 896 unsigned NumOperands = Gep->getNumOperands(); 897 898 Value *LastGepOperand = Gep->getOperand(NumOperands - 1); 899 VectorParts &GEPParts = getVectorValue(LastGepOperand); 900 Value *LastIndex = GEPParts[0]; 901 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 902 903 // Create the new GEP with the new induction variable. 904 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 905 Gep2->setOperand(NumOperands - 1, LastIndex); 906 Gep2->setName("gep.indvar.idx"); 907 Ptr = Builder.Insert(Gep2); 908 } else { 909 // Use the induction element ptr. 910 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 911 VectorParts &PtrVal = getVectorValue(Ptr); 912 Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); 913 } 914 915 // Handle Stores: 916 if (SI) { 917 assert(!Legal->isUniform(SI->getPointerOperand()) && 918 "We do not allow storing to uniform addresses"); 919 920 VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); 921 for (unsigned Part = 0; Part < UF; ++Part) { 922 // Calculate the pointer for the specific unroll-part. 923 Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); 924 925 if (Reverse) { 926 // If we store to reverse consecutive memory locations then we need 927 // to reverse the order of elements in the stored value. 928 StoredVal[Part] = reverseVector(StoredVal[Part]); 929 // If the address is consecutive but reversed, then the 930 // wide store needs to start at the last vector element. 931 PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); 932 PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); 933 } 934 935 Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); 936 Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); 937 } 938 } 939 940 for (unsigned Part = 0; Part < UF; ++Part) { 941 // Calculate the pointer for the specific unroll-part. 942 Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); 943 944 if (Reverse) { 945 // If the address is consecutive but reversed, then the 946 // wide store needs to start at the last vector element. 947 PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); 948 PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); 949 } 950 951 Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); 952 Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); 953 cast<LoadInst>(LI)->setAlignment(Alignment); 954 Entry[Part] = Reverse ? reverseVector(LI) : LI; 955 } 956} 957 958void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 959 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 960 // Holds vector parameters or scalars, in case of uniform vals. 961 SmallVector<VectorParts, 4> Params; 962 963 // Find all of the vectorized parameters. 964 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 965 Value *SrcOp = Instr->getOperand(op); 966 967 // If we are accessing the old induction variable, use the new one. 968 if (SrcOp == OldInduction) { 969 Params.push_back(getVectorValue(SrcOp)); 970 continue; 971 } 972 973 // Try using previously calculated values. 974 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 975 976 // If the src is an instruction that appeared earlier in the basic block 977 // then it should already be vectorized. 978 if (SrcInst && OrigLoop->contains(SrcInst)) { 979 assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); 980 // The parameter is a vector value from earlier. 981 Params.push_back(WidenMap.get(SrcInst)); 982 } else { 983 // The parameter is a scalar from outside the loop. Maybe even a constant. 984 VectorParts Scalars; 985 Scalars.append(UF, SrcOp); 986 Params.push_back(Scalars); 987 } 988 } 989 990 assert(Params.size() == Instr->getNumOperands() && 991 "Invalid number of operands"); 992 993 // Does this instruction return a value ? 994 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 995 996 Value *UndefVec = IsVoidRetTy ? 0 : 997 UndefValue::get(VectorType::get(Instr->getType(), VF)); 998 // Create a new entry in the WidenMap and initialize it to Undef or Null. 999 VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); 1000 1001 // For each scalar that we create: 1002 for (unsigned Width = 0; Width < VF; ++Width) { 1003 // For each vector unroll 'part': 1004 for (unsigned Part = 0; Part < UF; ++Part) { 1005 Instruction *Cloned = Instr->clone(); 1006 if (!IsVoidRetTy) 1007 Cloned->setName(Instr->getName() + ".cloned"); 1008 // Replace the operands of the cloned instrucions with extracted scalars. 1009 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 1010 Value *Op = Params[op][Part]; 1011 // Param is a vector. Need to extract the right lane. 1012 if (Op->getType()->isVectorTy()) 1013 Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); 1014 Cloned->setOperand(op, Op); 1015 } 1016 1017 // Place the cloned scalar in the new loop. 1018 Builder.Insert(Cloned); 1019 1020 // If the original scalar returns a value we need to place it in a vector 1021 // so that future users will be able to use it. 1022 if (!IsVoidRetTy) 1023 VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, 1024 Builder.getInt32(Width)); 1025 } 1026 } 1027} 1028 1029Instruction * 1030InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, 1031 Instruction *Loc) { 1032 LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = 1033 Legal->getRuntimePointerCheck(); 1034 1035 if (!PtrRtCheck->Need) 1036 return NULL; 1037 1038 Instruction *MemoryRuntimeCheck = 0; 1039 unsigned NumPointers = PtrRtCheck->Pointers.size(); 1040 SmallVector<Value* , 2> Starts; 1041 SmallVector<Value* , 2> Ends; 1042 1043 SCEVExpander Exp(*SE, "induction"); 1044 1045 // Use this type for pointer arithmetic. 1046 Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0); 1047 1048 for (unsigned i = 0; i < NumPointers; ++i) { 1049 Value *Ptr = PtrRtCheck->Pointers[i]; 1050 const SCEV *Sc = SE->getSCEV(Ptr); 1051 1052 if (SE->isLoopInvariant(Sc, OrigLoop)) { 1053 DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << 1054 *Ptr <<"\n"); 1055 Starts.push_back(Ptr); 1056 Ends.push_back(Ptr); 1057 } else { 1058 DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); 1059 1060 Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); 1061 Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); 1062 Starts.push_back(Start); 1063 Ends.push_back(End); 1064 } 1065 } 1066 1067 IRBuilder<> ChkBuilder(Loc); 1068 1069 for (unsigned i = 0; i < NumPointers; ++i) { 1070 for (unsigned j = i+1; j < NumPointers; ++j) { 1071 Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); 1072 Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); 1073 Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); 1074 Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc"); 1075 1076 Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); 1077 Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); 1078 Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); 1079 if (MemoryRuntimeCheck) 1080 IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, 1081 "conflict.rdx"); 1082 1083 MemoryRuntimeCheck = cast<Instruction>(IsConflict); 1084 } 1085 } 1086 1087 return MemoryRuntimeCheck; 1088} 1089 1090void 1091InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 1092 /* 1093 In this function we generate a new loop. The new loop will contain 1094 the vectorized instructions while the old loop will continue to run the 1095 scalar remainder. 1096 1097 [ ] <-- vector loop bypass (may consist of multiple blocks). 1098 / | 1099 / v 1100 | [ ] <-- vector pre header. 1101 | | 1102 | v 1103 | [ ] \ 1104 | [ ]_| <-- vector loop. 1105 | | 1106 \ v 1107 >[ ] <--- middle-block. 1108 / | 1109 / v 1110 | [ ] <--- new preheader. 1111 | | 1112 | v 1113 | [ ] \ 1114 | [ ]_| <-- old scalar loop to handle remainder. 1115 \ | 1116 \ v 1117 >[ ] <-- exit block. 1118 ... 1119 */ 1120 1121 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 1122 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 1123 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 1124 assert(ExitBlock && "Must have an exit block"); 1125 1126 // Some loops have a single integer induction variable, while other loops 1127 // don't. One example is c++ iterators that often have multiple pointer 1128 // induction variables. In the code below we also support a case where we 1129 // don't have a single induction variable. 1130 OldInduction = Legal->getInduction(); 1131 Type *IdxTy = OldInduction ? OldInduction->getType() : 1132 DL->getIntPtrType(SE->getContext()); 1133 1134 // Find the loop boundaries. 1135 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); 1136 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 1137 1138 // Get the total trip count from the count by adding 1. 1139 ExitCount = SE->getAddExpr(ExitCount, 1140 SE->getConstant(ExitCount->getType(), 1)); 1141 1142 // Expand the trip count and place the new instructions in the preheader. 1143 // Notice that the pre-header does not change, only the loop body. 1144 SCEVExpander Exp(*SE, "induction"); 1145 1146 // Count holds the overall loop count (N). 1147 Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 1148 BypassBlock->getTerminator()); 1149 1150 // The loop index does not have to start at Zero. Find the original start 1151 // value from the induction PHI node. If we don't have an induction variable 1152 // then we know that it starts at zero. 1153 Value *StartIdx = OldInduction ? 1154 OldInduction->getIncomingValueForBlock(BypassBlock): 1155 ConstantInt::get(IdxTy, 0); 1156 1157 assert(BypassBlock && "Invalid loop structure"); 1158 LoopBypassBlocks.push_back(BypassBlock); 1159 1160 // Split the single block loop into the two loop structure described above. 1161 BasicBlock *VectorPH = 1162 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 1163 BasicBlock *VecBody = 1164 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 1165 BasicBlock *MiddleBlock = 1166 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 1167 BasicBlock *ScalarPH = 1168 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 1169 1170 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 1171 // inside the loop. 1172 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 1173 1174 // Generate the induction variable. 1175 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 1176 // The loop step is equal to the vectorization factor (num of SIMD elements) 1177 // times the unroll factor (num of SIMD instructions). 1178 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 1179 1180 // This is the IR builder that we use to add all of the logic for bypassing 1181 // the new vector loop. 1182 IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); 1183 1184 // We may need to extend the index in case there is a type mismatch. 1185 // We know that the count starts at zero and does not overflow. 1186 if (Count->getType() != IdxTy) { 1187 // The exit count can be of pointer type. Convert it to the correct 1188 // integer type. 1189 if (ExitCount->getType()->isPointerTy()) 1190 Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); 1191 else 1192 Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); 1193 } 1194 1195 // Add the start index to the loop count to get the new end index. 1196 Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); 1197 1198 // Now we need to generate the expression for N - (N % VF), which is 1199 // the part that the vectorized body will execute. 1200 Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); 1201 Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); 1202 Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, 1203 "end.idx.rnd.down"); 1204 1205 // Now, compare the new count to zero. If it is zero skip the vector loop and 1206 // jump to the scalar loop. 1207 Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, 1208 "cmp.zero"); 1209 1210 BasicBlock *LastBypassBlock = BypassBlock; 1211 1212 // Generate the code that checks in runtime if arrays overlap. We put the 1213 // checks into a separate block to make the more common case of few elements 1214 // faster. 1215 Instruction *MemRuntimeCheck = addRuntimeCheck(Legal, 1216 BypassBlock->getTerminator()); 1217 if (MemRuntimeCheck) { 1218 // Create a new block containing the memory check. 1219 BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck, 1220 "vector.memcheck"); 1221 LoopBypassBlocks.push_back(CheckBlock); 1222 1223 // Replace the branch into the memory check block with a conditional branch 1224 // for the "few elements case". 1225 Instruction *OldTerm = BypassBlock->getTerminator(); 1226 BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); 1227 OldTerm->eraseFromParent(); 1228 1229 Cmp = MemRuntimeCheck; 1230 LastBypassBlock = CheckBlock; 1231 } 1232 1233 LastBypassBlock->getTerminator()->eraseFromParent(); 1234 BranchInst::Create(MiddleBlock, VectorPH, Cmp, 1235 LastBypassBlock); 1236 1237 // We are going to resume the execution of the scalar loop. 1238 // Go over all of the induction variables that we found and fix the 1239 // PHIs that are left in the scalar version of the loop. 1240 // The starting values of PHI nodes depend on the counter of the last 1241 // iteration in the vectorized loop. 1242 // If we come from a bypass edge then we need to start from the original 1243 // start value. 1244 1245 // This variable saves the new starting index for the scalar loop. 1246 PHINode *ResumeIndex = 0; 1247 LoopVectorizationLegality::InductionList::iterator I, E; 1248 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 1249 for (I = List->begin(), E = List->end(); I != E; ++I) { 1250 PHINode *OrigPhi = I->first; 1251 LoopVectorizationLegality::InductionInfo II = I->second; 1252 PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", 1253 MiddleBlock->getTerminator()); 1254 Value *EndValue = 0; 1255 switch (II.IK) { 1256 case LoopVectorizationLegality::IK_NoInduction: 1257 llvm_unreachable("Unknown induction"); 1258 case LoopVectorizationLegality::IK_IntInduction: { 1259 // Handle the integer induction counter: 1260 assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); 1261 assert(OrigPhi == OldInduction && "Unknown integer PHI"); 1262 // We know what the end value is. 1263 EndValue = IdxEndRoundDown; 1264 // We also know which PHI node holds it. 1265 ResumeIndex = ResumeVal; 1266 break; 1267 } 1268 case LoopVectorizationLegality::IK_ReverseIntInduction: { 1269 // Convert the CountRoundDown variable to the PHI size. 1270 unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); 1271 unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); 1272 Value *CRD = CountRoundDown; 1273 if (CRDSize > IISize) 1274 CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, 1275 II.StartValue->getType(), "tr.crd", 1276 LoopBypassBlocks.back()->getTerminator()); 1277 else if (CRDSize < IISize) 1278 CRD = CastInst::Create(Instruction::SExt, CountRoundDown, 1279 II.StartValue->getType(), 1280 "sext.crd", 1281 LoopBypassBlocks.back()->getTerminator()); 1282 // Handle reverse integer induction counter: 1283 EndValue = 1284 BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", 1285 LoopBypassBlocks.back()->getTerminator()); 1286 break; 1287 } 1288 case LoopVectorizationLegality::IK_PtrInduction: { 1289 // For pointer induction variables, calculate the offset using 1290 // the end index. 1291 EndValue = 1292 GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end", 1293 LoopBypassBlocks.back()->getTerminator()); 1294 break; 1295 } 1296 case LoopVectorizationLegality::IK_ReversePtrInduction: { 1297 // The value at the end of the loop for the reverse pointer is calculated 1298 // by creating a GEP with a negative index starting from the start value. 1299 Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); 1300 Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown, 1301 "rev.ind.end", 1302 LoopBypassBlocks.back()->getTerminator()); 1303 EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx, 1304 "rev.ptr.ind.end", 1305 LoopBypassBlocks.back()->getTerminator()); 1306 break; 1307 } 1308 }// end of case 1309 1310 // The new PHI merges the original incoming value, in case of a bypass, 1311 // or the value at the end of the vectorized loop. 1312 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 1313 ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); 1314 ResumeVal->addIncoming(EndValue, VecBody); 1315 1316 // Fix the scalar body counter (PHI node). 1317 unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); 1318 OrigPhi->setIncomingValue(BlockIdx, ResumeVal); 1319 } 1320 1321 // If we are generating a new induction variable then we also need to 1322 // generate the code that calculates the exit value. This value is not 1323 // simply the end of the counter because we may skip the vectorized body 1324 // in case of a runtime check. 1325 if (!OldInduction){ 1326 assert(!ResumeIndex && "Unexpected resume value found"); 1327 ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", 1328 MiddleBlock->getTerminator()); 1329 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 1330 ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); 1331 ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); 1332 } 1333 1334 // Make sure that we found the index where scalar loop needs to continue. 1335 assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && 1336 "Invalid resume Index"); 1337 1338 // Add a check in the middle block to see if we have completed 1339 // all of the iterations in the first vector loop. 1340 // If (N - N%VF) == N, then we *don't* need to run the remainder. 1341 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 1342 ResumeIndex, "cmp.n", 1343 MiddleBlock->getTerminator()); 1344 1345 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 1346 // Remove the old terminator. 1347 MiddleBlock->getTerminator()->eraseFromParent(); 1348 1349 // Create i+1 and fill the PHINode. 1350 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 1351 Induction->addIncoming(StartIdx, VectorPH); 1352 Induction->addIncoming(NextIdx, VecBody); 1353 // Create the compare. 1354 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 1355 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 1356 1357 // Now we have two terminators. Remove the old one from the block. 1358 VecBody->getTerminator()->eraseFromParent(); 1359 1360 // Get ready to start creating new instructions into the vectorized body. 1361 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 1362 1363 // Create and register the new vector loop. 1364 Loop* Lp = new Loop(); 1365 Loop *ParentLoop = OrigLoop->getParentLoop(); 1366 1367 // Insert the new loop into the loop nest and register the new basic blocks. 1368 if (ParentLoop) { 1369 ParentLoop->addChildLoop(Lp); 1370 for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) 1371 ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase()); 1372 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 1373 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 1374 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 1375 } else { 1376 LI->addTopLevelLoop(Lp); 1377 } 1378 1379 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 1380 1381 // Save the state. 1382 LoopVectorPreHeader = VectorPH; 1383 LoopScalarPreHeader = ScalarPH; 1384 LoopMiddleBlock = MiddleBlock; 1385 LoopExitBlock = ExitBlock; 1386 LoopVectorBody = VecBody; 1387 LoopScalarBody = OldBasicBlock; 1388} 1389 1390/// This function returns the identity element (or neutral element) for 1391/// the operation K. 1392static Constant* 1393getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) { 1394 switch (K) { 1395 case LoopVectorizationLegality:: RK_IntegerXor: 1396 case LoopVectorizationLegality:: RK_IntegerAdd: 1397 case LoopVectorizationLegality:: RK_IntegerOr: 1398 // Adding, Xoring, Oring zero to a number does not change it. 1399 return ConstantInt::get(Tp, 0); 1400 case LoopVectorizationLegality:: RK_IntegerMult: 1401 // Multiplying a number by 1 does not change it. 1402 return ConstantInt::get(Tp, 1); 1403 case LoopVectorizationLegality:: RK_IntegerAnd: 1404 // AND-ing a number with an all-1 value does not change it. 1405 return ConstantInt::get(Tp, -1, true); 1406 case LoopVectorizationLegality:: RK_FloatMult: 1407 // Multiplying a number by 1 does not change it. 1408 return ConstantFP::get(Tp, 1.0L); 1409 case LoopVectorizationLegality:: RK_FloatAdd: 1410 // Adding zero to a number does not change it. 1411 return ConstantFP::get(Tp, 0.0L); 1412 default: 1413 llvm_unreachable("Unknown reduction kind"); 1414 } 1415} 1416 1417static bool 1418isTriviallyVectorizableIntrinsic(Instruction *Inst) { 1419 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); 1420 if (!II) 1421 return false; 1422 switch (II->getIntrinsicID()) { 1423 case Intrinsic::sqrt: 1424 case Intrinsic::sin: 1425 case Intrinsic::cos: 1426 case Intrinsic::exp: 1427 case Intrinsic::exp2: 1428 case Intrinsic::log: 1429 case Intrinsic::log10: 1430 case Intrinsic::log2: 1431 case Intrinsic::fabs: 1432 case Intrinsic::floor: 1433 case Intrinsic::ceil: 1434 case Intrinsic::trunc: 1435 case Intrinsic::rint: 1436 case Intrinsic::nearbyint: 1437 case Intrinsic::pow: 1438 case Intrinsic::fma: 1439 case Intrinsic::fmuladd: 1440 return true; 1441 default: 1442 return false; 1443 } 1444 return false; 1445} 1446 1447/// This function translates the reduction kind to an LLVM binary operator. 1448static Instruction::BinaryOps 1449getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { 1450 switch (Kind) { 1451 case LoopVectorizationLegality::RK_IntegerAdd: 1452 return Instruction::Add; 1453 case LoopVectorizationLegality::RK_IntegerMult: 1454 return Instruction::Mul; 1455 case LoopVectorizationLegality::RK_IntegerOr: 1456 return Instruction::Or; 1457 case LoopVectorizationLegality::RK_IntegerAnd: 1458 return Instruction::And; 1459 case LoopVectorizationLegality::RK_IntegerXor: 1460 return Instruction::Xor; 1461 case LoopVectorizationLegality::RK_FloatMult: 1462 return Instruction::FMul; 1463 case LoopVectorizationLegality::RK_FloatAdd: 1464 return Instruction::FAdd; 1465 default: 1466 llvm_unreachable("Unknown reduction operation"); 1467 } 1468} 1469 1470void 1471InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { 1472 //===------------------------------------------------===// 1473 // 1474 // Notice: any optimization or new instruction that go 1475 // into the code below should be also be implemented in 1476 // the cost-model. 1477 // 1478 //===------------------------------------------------===// 1479 Constant *Zero = Builder.getInt32(0); 1480 1481 // In order to support reduction variables we need to be able to vectorize 1482 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 1483 // stages. First, we create a new vector PHI node with no incoming edges. 1484 // We use this value when we vectorize all of the instructions that use the 1485 // PHI. Next, after all of the instructions in the block are complete we 1486 // add the new incoming edges to the PHI. At this point all of the 1487 // instructions in the basic block are vectorized, so we can use them to 1488 // construct the PHI. 1489 PhiVector RdxPHIsToFix; 1490 1491 // Scan the loop in a topological order to ensure that defs are vectorized 1492 // before users. 1493 LoopBlocksDFS DFS(OrigLoop); 1494 DFS.perform(LI); 1495 1496 // Vectorize all of the blocks in the original loop. 1497 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 1498 be = DFS.endRPO(); bb != be; ++bb) 1499 vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix); 1500 1501 // At this point every instruction in the original loop is widened to 1502 // a vector form. We are almost done. Now, we need to fix the PHI nodes 1503 // that we vectorized. The PHI nodes are currently empty because we did 1504 // not want to introduce cycles. Notice that the remaining PHI nodes 1505 // that we need to fix are reduction variables. 1506 1507 // Create the 'reduced' values for each of the induction vars. 1508 // The reduced values are the vector values that we scalarize and combine 1509 // after the loop is finished. 1510 for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); 1511 it != e; ++it) { 1512 PHINode *RdxPhi = *it; 1513 assert(RdxPhi && "Unable to recover vectorized PHI"); 1514 1515 // Find the reduction variable descriptor. 1516 assert(Legal->getReductionVars()->count(RdxPhi) && 1517 "Unable to find the reduction variable"); 1518 LoopVectorizationLegality::ReductionDescriptor RdxDesc = 1519 (*Legal->getReductionVars())[RdxPhi]; 1520 1521 // We need to generate a reduction vector from the incoming scalar. 1522 // To do so, we need to generate the 'identity' vector and overide 1523 // one of the elements with the incoming scalar reduction. We need 1524 // to do it in the vector-loop preheader. 1525 Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); 1526 1527 // This is the vector-clone of the value that leaves the loop. 1528 VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); 1529 Type *VecTy = VectorExit[0]->getType(); 1530 1531 // Find the reduction identity variable. Zero for addition, or, xor, 1532 // one for multiplication, -1 for And. 1533 Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType()); 1534 Constant *Identity = ConstantVector::getSplat(VF, Iden); 1535 1536 // This vector is the Identity vector where the first element is the 1537 // incoming scalar reduction. 1538 Value *VectorStart = Builder.CreateInsertElement(Identity, 1539 RdxDesc.StartValue, Zero); 1540 1541 // Fix the vector-loop phi. 1542 // We created the induction variable so we know that the 1543 // preheader is the first entry. 1544 BasicBlock *VecPreheader = Induction->getIncomingBlock(0); 1545 1546 // Reductions do not have to start at zero. They can start with 1547 // any loop invariant values. 1548 VectorParts &VecRdxPhi = WidenMap.get(RdxPhi); 1549 BasicBlock *Latch = OrigLoop->getLoopLatch(); 1550 Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); 1551 VectorParts &Val = getVectorValue(LoopVal); 1552 for (unsigned part = 0; part < UF; ++part) { 1553 // Make sure to add the reduction stat value only to the 1554 // first unroll part. 1555 Value *StartVal = (part == 0) ? VectorStart : Identity; 1556 cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); 1557 cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody); 1558 } 1559 1560 // Before each round, move the insertion point right between 1561 // the PHIs and the values we are going to write. 1562 // This allows us to write both PHINodes and the extractelement 1563 // instructions. 1564 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); 1565 1566 VectorParts RdxParts; 1567 for (unsigned part = 0; part < UF; ++part) { 1568 // This PHINode contains the vectorized reduction variable, or 1569 // the initial value vector, if we bypass the vector loop. 1570 VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); 1571 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); 1572 Value *StartVal = (part == 0) ? VectorStart : Identity; 1573 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 1574 NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); 1575 NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody); 1576 RdxParts.push_back(NewPhi); 1577 } 1578 1579 // Reduce all of the unrolled parts into a single vector. 1580 Value *ReducedPartRdx = RdxParts[0]; 1581 for (unsigned part = 1; part < UF; ++part) { 1582 Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); 1583 ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx, 1584 "bin.rdx"); 1585 } 1586 1587 // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles 1588 // and vector ops, reducing the set of values being computed by half each 1589 // round. 1590 assert(isPowerOf2_32(VF) && 1591 "Reduction emission only supported for pow2 vectors!"); 1592 Value *TmpVec = ReducedPartRdx; 1593 SmallVector<Constant*, 32> ShuffleMask(VF, 0); 1594 for (unsigned i = VF; i != 1; i >>= 1) { 1595 // Move the upper half of the vector to the lower half. 1596 for (unsigned j = 0; j != i/2; ++j) 1597 ShuffleMask[j] = Builder.getInt32(i/2 + j); 1598 1599 // Fill the rest of the mask with undef. 1600 std::fill(&ShuffleMask[i/2], ShuffleMask.end(), 1601 UndefValue::get(Builder.getInt32Ty())); 1602 1603 Value *Shuf = 1604 Builder.CreateShuffleVector(TmpVec, 1605 UndefValue::get(TmpVec->getType()), 1606 ConstantVector::get(ShuffleMask), 1607 "rdx.shuf"); 1608 1609 Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); 1610 TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx"); 1611 } 1612 1613 // The result is in the first element of the vector. 1614 Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); 1615 1616 // Now, we need to fix the users of the reduction variable 1617 // inside and outside of the scalar remainder loop. 1618 // We know that the loop is in LCSSA form. We need to update the 1619 // PHI nodes in the exit blocks. 1620 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 1621 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 1622 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 1623 if (!LCSSAPhi) continue; 1624 1625 // All PHINodes need to have a single entry edge, or two if 1626 // we already fixed them. 1627 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 1628 1629 // We found our reduction value exit-PHI. Update it with the 1630 // incoming bypass edge. 1631 if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { 1632 // Add an edge coming from the bypass. 1633 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); 1634 break; 1635 } 1636 }// end of the LCSSA phi scan. 1637 1638 // Fix the scalar loop reduction variable with the incoming reduction sum 1639 // from the vector body and from the backedge value. 1640 int IncomingEdgeBlockIdx = 1641 (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); 1642 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 1643 // Pick the other block. 1644 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 1645 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); 1646 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); 1647 }// end of for each redux variable. 1648 1649 // The Loop exit block may have single value PHI nodes where the incoming 1650 // value is 'undef'. While vectorizing we only handled real values that 1651 // were defined inside the loop. Here we handle the 'undef case'. 1652 // See PR14725. 1653 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 1654 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 1655 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 1656 if (!LCSSAPhi) continue; 1657 if (LCSSAPhi->getNumIncomingValues() == 1) 1658 LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), 1659 LoopMiddleBlock); 1660 } 1661} 1662 1663InnerLoopVectorizer::VectorParts 1664InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 1665 assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && 1666 "Invalid edge"); 1667 1668 VectorParts SrcMask = createBlockInMask(Src); 1669 1670 // The terminator has to be a branch inst! 1671 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 1672 assert(BI && "Unexpected terminator found"); 1673 1674 if (BI->isConditional()) { 1675 VectorParts EdgeMask = getVectorValue(BI->getCondition()); 1676 1677 if (BI->getSuccessor(0) != Dst) 1678 for (unsigned part = 0; part < UF; ++part) 1679 EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); 1680 1681 for (unsigned part = 0; part < UF; ++part) 1682 EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); 1683 return EdgeMask; 1684 } 1685 1686 return SrcMask; 1687} 1688 1689InnerLoopVectorizer::VectorParts 1690InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { 1691 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 1692 1693 // Loop incoming mask is all-one. 1694 if (OrigLoop->getHeader() == BB) { 1695 Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); 1696 return getVectorValue(C); 1697 } 1698 1699 // This is the block mask. We OR all incoming edges, and with zero. 1700 Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); 1701 VectorParts BlockMask = getVectorValue(Zero); 1702 1703 // For each pred: 1704 for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { 1705 VectorParts EM = createEdgeMask(*it, BB); 1706 for (unsigned part = 0; part < UF; ++part) 1707 BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); 1708 } 1709 1710 return BlockMask; 1711} 1712 1713void 1714InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, 1715 BasicBlock *BB, PhiVector *PV) { 1716 // For each instruction in the old loop. 1717 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 1718 VectorParts &Entry = WidenMap.get(it); 1719 switch (it->getOpcode()) { 1720 case Instruction::Br: 1721 // Nothing to do for PHIs and BR, since we already took care of the 1722 // loop control flow instructions. 1723 continue; 1724 case Instruction::PHI:{ 1725 PHINode* P = cast<PHINode>(it); 1726 // Handle reduction variables: 1727 if (Legal->getReductionVars()->count(P)) { 1728 for (unsigned part = 0; part < UF; ++part) { 1729 // This is phase one of vectorizing PHIs. 1730 Type *VecTy = VectorType::get(it->getType(), VF); 1731 Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", 1732 LoopVectorBody-> getFirstInsertionPt()); 1733 } 1734 PV->push_back(P); 1735 continue; 1736 } 1737 1738 // Check for PHI nodes that are lowered to vector selects. 1739 if (P->getParent() != OrigLoop->getHeader()) { 1740 // We know that all PHIs in non header blocks are converted into 1741 // selects, so we don't have to worry about the insertion order and we 1742 // can just use the builder. 1743 1744 // At this point we generate the predication tree. There may be 1745 // duplications since this is a simple recursive scan, but future 1746 // optimizations will clean it up. 1747 VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), 1748 P->getParent()); 1749 1750 for (unsigned part = 0; part < UF; ++part) { 1751 VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); 1752 VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); 1753 Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part], 1754 "predphi"); 1755 } 1756 continue; 1757 } 1758 1759 // This PHINode must be an induction variable. 1760 // Make sure that we know about it. 1761 assert(Legal->getInductionVars()->count(P) && 1762 "Not an induction variable"); 1763 1764 LoopVectorizationLegality::InductionInfo II = 1765 Legal->getInductionVars()->lookup(P); 1766 1767 switch (II.IK) { 1768 case LoopVectorizationLegality::IK_NoInduction: 1769 llvm_unreachable("Unknown induction"); 1770 case LoopVectorizationLegality::IK_IntInduction: { 1771 assert(P == OldInduction && "Unexpected PHI"); 1772 Value *Broadcasted = getBroadcastInstrs(Induction); 1773 // After broadcasting the induction variable we need to make the 1774 // vector consecutive by adding 0, 1, 2 ... 1775 for (unsigned part = 0; part < UF; ++part) 1776 Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); 1777 continue; 1778 } 1779 case LoopVectorizationLegality::IK_ReverseIntInduction: 1780 case LoopVectorizationLegality::IK_PtrInduction: 1781 case LoopVectorizationLegality::IK_ReversePtrInduction: 1782 // Handle reverse integer and pointer inductions. 1783 Value *StartIdx = 0; 1784 // If we have a single integer induction variable then use it. 1785 // Otherwise, start counting at zero. 1786 if (OldInduction) { 1787 LoopVectorizationLegality::InductionInfo OldII = 1788 Legal->getInductionVars()->lookup(OldInduction); 1789 StartIdx = OldII.StartValue; 1790 } else { 1791 StartIdx = ConstantInt::get(Induction->getType(), 0); 1792 } 1793 // This is the normalized GEP that starts counting at zero. 1794 Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, 1795 "normalized.idx"); 1796 1797 // Handle the reverse integer induction variable case. 1798 if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { 1799 IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); 1800 Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, 1801 "resize.norm.idx"); 1802 Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, 1803 "reverse.idx"); 1804 1805 // This is a new value so do not hoist it out. 1806 Value *Broadcasted = getBroadcastInstrs(ReverseInd); 1807 // After broadcasting the induction variable we need to make the 1808 // vector consecutive by adding ... -3, -2, -1, 0. 1809 for (unsigned part = 0; part < UF; ++part) 1810 Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true); 1811 continue; 1812 } 1813 1814 // Handle the pointer induction variable case. 1815 assert(P->getType()->isPointerTy() && "Unexpected type."); 1816 1817 // Is this a reverse induction ptr or a consecutive induction ptr. 1818 bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == 1819 II.IK); 1820 1821 // This is the vector of results. Notice that we don't generate 1822 // vector geps because scalar geps result in better code. 1823 for (unsigned part = 0; part < UF; ++part) { 1824 Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); 1825 for (unsigned int i = 0; i < VF; ++i) { 1826 int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); 1827 Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); 1828 Value *GlobalIdx; 1829 if (!Reverse) 1830 GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); 1831 else 1832 GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); 1833 1834 Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, 1835 "next.gep"); 1836 VecVal = Builder.CreateInsertElement(VecVal, SclrGep, 1837 Builder.getInt32(i), 1838 "insert.gep"); 1839 } 1840 Entry[part] = VecVal; 1841 } 1842 continue; 1843 } 1844 1845 }// End of PHI. 1846 1847 case Instruction::Add: 1848 case Instruction::FAdd: 1849 case Instruction::Sub: 1850 case Instruction::FSub: 1851 case Instruction::Mul: 1852 case Instruction::FMul: 1853 case Instruction::UDiv: 1854 case Instruction::SDiv: 1855 case Instruction::FDiv: 1856 case Instruction::URem: 1857 case Instruction::SRem: 1858 case Instruction::FRem: 1859 case Instruction::Shl: 1860 case Instruction::LShr: 1861 case Instruction::AShr: 1862 case Instruction::And: 1863 case Instruction::Or: 1864 case Instruction::Xor: { 1865 // Just widen binops. 1866 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); 1867 VectorParts &A = getVectorValue(it->getOperand(0)); 1868 VectorParts &B = getVectorValue(it->getOperand(1)); 1869 1870 // Use this vector value for all users of the original instruction. 1871 for (unsigned Part = 0; Part < UF; ++Part) { 1872 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); 1873 1874 // Update the NSW, NUW and Exact flags. Notice: V can be an Undef. 1875 BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V); 1876 if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) { 1877 VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); 1878 VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); 1879 } 1880 if (VecOp && isa<PossiblyExactOperator>(VecOp)) 1881 VecOp->setIsExact(BinOp->isExact()); 1882 1883 Entry[Part] = V; 1884 } 1885 break; 1886 } 1887 case Instruction::Select: { 1888 // Widen selects. 1889 // If the selector is loop invariant we can create a select 1890 // instruction with a scalar condition. Otherwise, use vector-select. 1891 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), 1892 OrigLoop); 1893 1894 // The condition can be loop invariant but still defined inside the 1895 // loop. This means that we can't just use the original 'cond' value. 1896 // We have to take the 'vectorized' value and pick the first lane. 1897 // Instcombine will make this a no-op. 1898 VectorParts &Cond = getVectorValue(it->getOperand(0)); 1899 VectorParts &Op0 = getVectorValue(it->getOperand(1)); 1900 VectorParts &Op1 = getVectorValue(it->getOperand(2)); 1901 Value *ScalarCond = Builder.CreateExtractElement(Cond[0], 1902 Builder.getInt32(0)); 1903 for (unsigned Part = 0; Part < UF; ++Part) { 1904 Entry[Part] = Builder.CreateSelect( 1905 InvariantCond ? ScalarCond : Cond[Part], 1906 Op0[Part], 1907 Op1[Part]); 1908 } 1909 break; 1910 } 1911 1912 case Instruction::ICmp: 1913 case Instruction::FCmp: { 1914 // Widen compares. Generate vector compares. 1915 bool FCmp = (it->getOpcode() == Instruction::FCmp); 1916 CmpInst *Cmp = dyn_cast<CmpInst>(it); 1917 VectorParts &A = getVectorValue(it->getOperand(0)); 1918 VectorParts &B = getVectorValue(it->getOperand(1)); 1919 for (unsigned Part = 0; Part < UF; ++Part) { 1920 Value *C = 0; 1921 if (FCmp) 1922 C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); 1923 else 1924 C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); 1925 Entry[Part] = C; 1926 } 1927 break; 1928 } 1929 1930 case Instruction::Store: 1931 case Instruction::Load: 1932 vectorizeMemoryInstruction(it, Legal); 1933 break; 1934 case Instruction::ZExt: 1935 case Instruction::SExt: 1936 case Instruction::FPToUI: 1937 case Instruction::FPToSI: 1938 case Instruction::FPExt: 1939 case Instruction::PtrToInt: 1940 case Instruction::IntToPtr: 1941 case Instruction::SIToFP: 1942 case Instruction::UIToFP: 1943 case Instruction::Trunc: 1944 case Instruction::FPTrunc: 1945 case Instruction::BitCast: { 1946 CastInst *CI = dyn_cast<CastInst>(it); 1947 /// Optimize the special case where the source is the induction 1948 /// variable. Notice that we can only optimize the 'trunc' case 1949 /// because: a. FP conversions lose precision, b. sext/zext may wrap, 1950 /// c. other casts depend on pointer size. 1951 if (CI->getOperand(0) == OldInduction && 1952 it->getOpcode() == Instruction::Trunc) { 1953 Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, 1954 CI->getType()); 1955 Value *Broadcasted = getBroadcastInstrs(ScalarCast); 1956 for (unsigned Part = 0; Part < UF; ++Part) 1957 Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); 1958 break; 1959 } 1960 /// Vectorize casts. 1961 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 1962 1963 VectorParts &A = getVectorValue(it->getOperand(0)); 1964 for (unsigned Part = 0; Part < UF; ++Part) 1965 Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); 1966 break; 1967 } 1968 1969 case Instruction::Call: { 1970 assert(isTriviallyVectorizableIntrinsic(it)); 1971 Module *M = BB->getParent()->getParent(); 1972 IntrinsicInst *II = cast<IntrinsicInst>(it); 1973 Intrinsic::ID ID = II->getIntrinsicID(); 1974 for (unsigned Part = 0; Part < UF; ++Part) { 1975 SmallVector<Value*, 4> Args; 1976 for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) { 1977 VectorParts &Arg = getVectorValue(II->getArgOperand(i)); 1978 Args.push_back(Arg[Part]); 1979 } 1980 Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; 1981 Function *F = Intrinsic::getDeclaration(M, ID, Tys); 1982 Entry[Part] = Builder.CreateCall(F, Args); 1983 } 1984 break; 1985 } 1986 1987 default: 1988 // All other instructions are unsupported. Scalarize them. 1989 scalarizeInstruction(it); 1990 break; 1991 }// end of switch. 1992 }// end of for_each instr. 1993} 1994 1995void InnerLoopVectorizer::updateAnalysis() { 1996 // Forget the original basic block. 1997 SE->forgetLoop(OrigLoop); 1998 1999 // Update the dominator tree information. 2000 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && 2001 "Entry does not dominate exit."); 2002 2003 for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) 2004 DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); 2005 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); 2006 DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); 2007 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front()); 2008 DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); 2009 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 2010 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2011 2012 DEBUG(DT->verifyAnalysis()); 2013} 2014 2015bool LoopVectorizationLegality::canVectorizeWithIfConvert() { 2016 if (!EnableIfConversion) 2017 return false; 2018 2019 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); 2020 std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); 2021 2022 // Collect the blocks that need predication. 2023 for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { 2024 BasicBlock *BB = LoopBlocks[i]; 2025 2026 // We don't support switch statements inside loops. 2027 if (!isa<BranchInst>(BB->getTerminator())) 2028 return false; 2029 2030 // We must have at most two predecessors because we need to convert 2031 // all PHIs to selects. 2032 unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); 2033 if (Preds > 2) 2034 return false; 2035 2036 // We must be able to predicate all blocks that need to be predicated. 2037 if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) 2038 return false; 2039 } 2040 2041 // We can if-convert this loop. 2042 return true; 2043} 2044 2045bool LoopVectorizationLegality::canVectorize() { 2046 assert(TheLoop->getLoopPreheader() && "No preheader!!"); 2047 2048 // We can only vectorize innermost loops. 2049 if (TheLoop->getSubLoopsVector().size()) 2050 return false; 2051 2052 // We must have a single backedge. 2053 if (TheLoop->getNumBackEdges() != 1) 2054 return false; 2055 2056 // We must have a single exiting block. 2057 if (!TheLoop->getExitingBlock()) 2058 return false; 2059 2060 unsigned NumBlocks = TheLoop->getNumBlocks(); 2061 2062 // Check if we can if-convert non single-bb loops. 2063 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { 2064 DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); 2065 return false; 2066 } 2067 2068 // We need to have a loop header. 2069 BasicBlock *Latch = TheLoop->getLoopLatch(); 2070 DEBUG(dbgs() << "LV: Found a loop: " << 2071 TheLoop->getHeader()->getName() << "\n"); 2072 2073 // ScalarEvolution needs to be able to find the exit count. 2074 const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); 2075 if (ExitCount == SE->getCouldNotCompute()) { 2076 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 2077 return false; 2078 } 2079 2080 // Do not loop-vectorize loops with a tiny trip count. 2081 unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); 2082 if (TC > 0u && TC < TinyTripCountVectorThreshold) { 2083 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << 2084 "This loop is not worth vectorizing.\n"); 2085 return false; 2086 } 2087 2088 // Check if we can vectorize the instructions and CFG in this loop. 2089 if (!canVectorizeInstrs()) { 2090 DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); 2091 return false; 2092 } 2093 2094 // Go over each instruction and look at memory deps. 2095 if (!canVectorizeMemory()) { 2096 DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); 2097 return false; 2098 } 2099 2100 // Collect all of the variables that remain uniform after vectorization. 2101 collectLoopUniforms(); 2102 2103 DEBUG(dbgs() << "LV: We can vectorize this loop" << 2104 (PtrRtCheck.Need ? " (with a runtime bound check)" : "") 2105 <<"!\n"); 2106 2107 // Okay! We can vectorize. At this point we don't have any other mem analysis 2108 // which may limit our maximum vectorization factor, so just return true with 2109 // no restrictions. 2110 return true; 2111} 2112 2113bool LoopVectorizationLegality::canVectorizeInstrs() { 2114 BasicBlock *PreHeader = TheLoop->getLoopPreheader(); 2115 BasicBlock *Header = TheLoop->getHeader(); 2116 2117 // For each block in the loop. 2118 for (Loop::block_iterator bb = TheLoop->block_begin(), 2119 be = TheLoop->block_end(); bb != be; ++bb) { 2120 2121 // Scan the instructions in the block and look for hazards. 2122 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 2123 ++it) { 2124 2125 if (PHINode *Phi = dyn_cast<PHINode>(it)) { 2126 // This should not happen because the loop should be normalized. 2127 if (Phi->getNumIncomingValues() != 2) { 2128 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 2129 return false; 2130 } 2131 2132 // Check that this PHI type is allowed. 2133 if (!Phi->getType()->isIntegerTy() && 2134 !Phi->getType()->isFloatingPointTy() && 2135 !Phi->getType()->isPointerTy()) { 2136 DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); 2137 return false; 2138 } 2139 2140 // If this PHINode is not in the header block, then we know that we 2141 // can convert it to select during if-conversion. No need to check if 2142 // the PHIs in this block are induction or reduction variables. 2143 if (*bb != Header) 2144 continue; 2145 2146 // This is the value coming from the preheader. 2147 Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); 2148 // Check if this is an induction variable. 2149 InductionKind IK = isInductionVariable(Phi); 2150 2151 if (IK_NoInduction != IK) { 2152 // Int inductions are special because we only allow one IV. 2153 if (IK == IK_IntInduction) { 2154 if (Induction) { 2155 DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); 2156 return false; 2157 } 2158 Induction = Phi; 2159 } 2160 2161 DEBUG(dbgs() << "LV: Found an induction variable.\n"); 2162 Inductions[Phi] = InductionInfo(StartValue, IK); 2163 continue; 2164 } 2165 2166 if (AddReductionVar(Phi, RK_IntegerAdd)) { 2167 DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); 2168 continue; 2169 } 2170 if (AddReductionVar(Phi, RK_IntegerMult)) { 2171 DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); 2172 continue; 2173 } 2174 if (AddReductionVar(Phi, RK_IntegerOr)) { 2175 DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); 2176 continue; 2177 } 2178 if (AddReductionVar(Phi, RK_IntegerAnd)) { 2179 DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); 2180 continue; 2181 } 2182 if (AddReductionVar(Phi, RK_IntegerXor)) { 2183 DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); 2184 continue; 2185 } 2186 if (AddReductionVar(Phi, RK_FloatMult)) { 2187 DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n"); 2188 continue; 2189 } 2190 if (AddReductionVar(Phi, RK_FloatAdd)) { 2191 DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); 2192 continue; 2193 } 2194 2195 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 2196 return false; 2197 }// end of PHI handling 2198 2199 // We still don't handle functions. 2200 CallInst *CI = dyn_cast<CallInst>(it); 2201 if (CI && !isTriviallyVectorizableIntrinsic(it)) { 2202 DEBUG(dbgs() << "LV: Found a call site.\n"); 2203 return false; 2204 } 2205 2206 // Check that the instruction return type is vectorizable. 2207 if (!VectorType::isValidElementType(it->getType()) && 2208 !it->getType()->isVoidTy()) { 2209 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); 2210 return false; 2211 } 2212 2213 // Check that the stored type is vectorizable. 2214 if (StoreInst *ST = dyn_cast<StoreInst>(it)) { 2215 Type *T = ST->getValueOperand()->getType(); 2216 if (!VectorType::isValidElementType(T)) 2217 return false; 2218 } 2219 2220 // Reduction instructions are allowed to have exit users. 2221 // All other instructions must not have external users. 2222 if (!AllowedExit.count(it)) 2223 //Check that all of the users of the loop are inside the BB. 2224 for (Value::use_iterator I = it->use_begin(), E = it->use_end(); 2225 I != E; ++I) { 2226 Instruction *U = cast<Instruction>(*I); 2227 // This user may be a reduction exit value. 2228 if (!TheLoop->contains(U)) { 2229 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); 2230 return false; 2231 } 2232 } 2233 } // next instr. 2234 2235 } 2236 2237 if (!Induction) { 2238 DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); 2239 assert(getInductionVars()->size() && "No induction variables"); 2240 } 2241 2242 return true; 2243} 2244 2245void LoopVectorizationLegality::collectLoopUniforms() { 2246 // We now know that the loop is vectorizable! 2247 // Collect variables that will remain uniform after vectorization. 2248 std::vector<Value*> Worklist; 2249 BasicBlock *Latch = TheLoop->getLoopLatch(); 2250 2251 // Start with the conditional branch and walk up the block. 2252 Worklist.push_back(Latch->getTerminator()->getOperand(0)); 2253 2254 while (Worklist.size()) { 2255 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 2256 Worklist.pop_back(); 2257 2258 // Look at instructions inside this loop. 2259 // Stop when reaching PHI nodes. 2260 // TODO: we need to follow values all over the loop, not only in this block. 2261 if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) 2262 continue; 2263 2264 // This is a known uniform. 2265 Uniforms.insert(I); 2266 2267 // Insert all operands. 2268 for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { 2269 Worklist.push_back(I->getOperand(i)); 2270 } 2271 } 2272} 2273 2274bool LoopVectorizationLegality::canVectorizeMemory() { 2275 typedef SmallVector<Value*, 16> ValueVector; 2276 typedef SmallPtrSet<Value*, 16> ValueSet; 2277 // Holds the Load and Store *instructions*. 2278 ValueVector Loads; 2279 ValueVector Stores; 2280 PtrRtCheck.Pointers.clear(); 2281 PtrRtCheck.Need = false; 2282 2283 // For each block. 2284 for (Loop::block_iterator bb = TheLoop->block_begin(), 2285 be = TheLoop->block_end(); bb != be; ++bb) { 2286 2287 // Scan the BB and collect legal loads and stores. 2288 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 2289 ++it) { 2290 2291 // If this is a load, save it. If this instruction can read from memory 2292 // but is not a load, then we quit. Notice that we don't handle function 2293 // calls that read or write. 2294 if (it->mayReadFromMemory()) { 2295 LoadInst *Ld = dyn_cast<LoadInst>(it); 2296 if (!Ld) return false; 2297 if (!Ld->isSimple()) { 2298 DEBUG(dbgs() << "LV: Found a non-simple load.\n"); 2299 return false; 2300 } 2301 Loads.push_back(Ld); 2302 continue; 2303 } 2304 2305 // Save 'store' instructions. Abort if other instructions write to memory. 2306 if (it->mayWriteToMemory()) { 2307 StoreInst *St = dyn_cast<StoreInst>(it); 2308 if (!St) return false; 2309 if (!St->isSimple()) { 2310 DEBUG(dbgs() << "LV: Found a non-simple store.\n"); 2311 return false; 2312 } 2313 Stores.push_back(St); 2314 } 2315 } // next instr. 2316 } // next block. 2317 2318 // Now we have two lists that hold the loads and the stores. 2319 // Next, we find the pointers that they use. 2320 2321 // Check if we see any stores. If there are no stores, then we don't 2322 // care if the pointers are *restrict*. 2323 if (!Stores.size()) { 2324 DEBUG(dbgs() << "LV: Found a read-only loop!\n"); 2325 return true; 2326 } 2327 2328 // Holds the read and read-write *pointers* that we find. 2329 ValueVector Reads; 2330 ValueVector ReadWrites; 2331 2332 // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects 2333 // multiple times on the same object. If the ptr is accessed twice, once 2334 // for read and once for write, it will only appear once (on the write 2335 // list). This is okay, since we are going to check for conflicts between 2336 // writes and between reads and writes, but not between reads and reads. 2337 ValueSet Seen; 2338 2339 ValueVector::iterator I, IE; 2340 for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { 2341 StoreInst *ST = cast<StoreInst>(*I); 2342 Value* Ptr = ST->getPointerOperand(); 2343 2344 if (isUniform(Ptr)) { 2345 DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); 2346 return false; 2347 } 2348 2349 // If we did *not* see this pointer before, insert it to 2350 // the read-write list. At this phase it is only a 'write' list. 2351 if (Seen.insert(Ptr)) 2352 ReadWrites.push_back(Ptr); 2353 } 2354 2355 for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { 2356 LoadInst *LD = cast<LoadInst>(*I); 2357 Value* Ptr = LD->getPointerOperand(); 2358 // If we did *not* see this pointer before, insert it to the 2359 // read list. If we *did* see it before, then it is already in 2360 // the read-write list. This allows us to vectorize expressions 2361 // such as A[i] += x; Because the address of A[i] is a read-write 2362 // pointer. This only works if the index of A[i] is consecutive. 2363 // If the address of i is unknown (for example A[B[i]]) then we may 2364 // read a few words, modify, and write a few words, and some of the 2365 // words may be written to the same address. 2366 if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) 2367 Reads.push_back(Ptr); 2368 } 2369 2370 // If we write (or read-write) to a single destination and there are no 2371 // other reads in this loop then is it safe to vectorize. 2372 if (ReadWrites.size() == 1 && Reads.size() == 0) { 2373 DEBUG(dbgs() << "LV: Found a write-only loop!\n"); 2374 return true; 2375 } 2376 2377 // Find pointers with computable bounds. We are going to use this information 2378 // to place a runtime bound check. 2379 bool CanDoRT = true; 2380 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) 2381 if (hasComputableBounds(*I)) { 2382 PtrRtCheck.insert(SE, TheLoop, *I); 2383 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 2384 } else { 2385 CanDoRT = false; 2386 break; 2387 } 2388 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) 2389 if (hasComputableBounds(*I)) { 2390 PtrRtCheck.insert(SE, TheLoop, *I); 2391 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 2392 } else { 2393 CanDoRT = false; 2394 break; 2395 } 2396 2397 // Check that we did not collect too many pointers or found a 2398 // unsizeable pointer. 2399 if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { 2400 PtrRtCheck.reset(); 2401 CanDoRT = false; 2402 } 2403 2404 if (CanDoRT) { 2405 DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); 2406 } 2407 2408 bool NeedRTCheck = false; 2409 2410 // Now that the pointers are in two lists (Reads and ReadWrites), we 2411 // can check that there are no conflicts between each of the writes and 2412 // between the writes to the reads. 2413 ValueSet WriteObjects; 2414 ValueVector TempObjects; 2415 2416 // Check that the read-writes do not conflict with other read-write 2417 // pointers. 2418 bool AllWritesIdentified = true; 2419 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { 2420 GetUnderlyingObjects(*I, TempObjects, DL); 2421 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 2422 it != e; ++it) { 2423 if (!isIdentifiedObject(*it)) { 2424 DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); 2425 NeedRTCheck = true; 2426 AllWritesIdentified = false; 2427 } 2428 if (!WriteObjects.insert(*it)) { 2429 DEBUG(dbgs() << "LV: Found a possible write-write reorder:" 2430 << **it <<"\n"); 2431 return false; 2432 } 2433 } 2434 TempObjects.clear(); 2435 } 2436 2437 /// Check that the reads don't conflict with the read-writes. 2438 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { 2439 GetUnderlyingObjects(*I, TempObjects, DL); 2440 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 2441 it != e; ++it) { 2442 // If all of the writes are identified then we don't care if the read 2443 // pointer is identified or not. 2444 if (!AllWritesIdentified && !isIdentifiedObject(*it)) { 2445 DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); 2446 NeedRTCheck = true; 2447 } 2448 if (WriteObjects.count(*it)) { 2449 DEBUG(dbgs() << "LV: Found a possible read/write reorder:" 2450 << **it <<"\n"); 2451 return false; 2452 } 2453 } 2454 TempObjects.clear(); 2455 } 2456 2457 PtrRtCheck.Need = NeedRTCheck; 2458 if (NeedRTCheck && !CanDoRT) { 2459 DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << 2460 "the array bounds.\n"); 2461 PtrRtCheck.reset(); 2462 return false; 2463 } 2464 2465 DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") << 2466 " need a runtime memory check.\n"); 2467 return true; 2468} 2469 2470bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, 2471 ReductionKind Kind) { 2472 if (Phi->getNumIncomingValues() != 2) 2473 return false; 2474 2475 // Reduction variables are only found in the loop header block. 2476 if (Phi->getParent() != TheLoop->getHeader()) 2477 return false; 2478 2479 // Obtain the reduction start value from the value that comes from the loop 2480 // preheader. 2481 Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); 2482 2483 // ExitInstruction is the single value which is used outside the loop. 2484 // We only allow for a single reduction value to be used outside the loop. 2485 // This includes users of the reduction, variables (which form a cycle 2486 // which ends in the phi node). 2487 Instruction *ExitInstruction = 0; 2488 // Indicates that we found a binary operation in our scan. 2489 bool FoundBinOp = false; 2490 2491 // Iter is our iterator. We start with the PHI node and scan for all of the 2492 // users of this instruction. All users must be instructions that can be 2493 // used as reduction variables (such as ADD). We may have a single 2494 // out-of-block user. The cycle must end with the original PHI. 2495 Instruction *Iter = Phi; 2496 while (true) { 2497 // If the instruction has no users then this is a broken 2498 // chain and can't be a reduction variable. 2499 if (Iter->use_empty()) 2500 return false; 2501 2502 // Did we find a user inside this loop already ? 2503 bool FoundInBlockUser = false; 2504 // Did we reach the initial PHI node already ? 2505 bool FoundStartPHI = false; 2506 2507 // Is this a bin op ? 2508 FoundBinOp |= !isa<PHINode>(Iter); 2509 2510 // For each of the *users* of iter. 2511 for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); 2512 it != e; ++it) { 2513 Instruction *U = cast<Instruction>(*it); 2514 // We already know that the PHI is a user. 2515 if (U == Phi) { 2516 FoundStartPHI = true; 2517 continue; 2518 } 2519 2520 // Check if we found the exit user. 2521 BasicBlock *Parent = U->getParent(); 2522 if (!TheLoop->contains(Parent)) { 2523 // Exit if you find multiple outside users. 2524 if (ExitInstruction != 0) 2525 return false; 2526 ExitInstruction = Iter; 2527 } 2528 2529 // We allow in-loop PHINodes which are not the original reduction PHI 2530 // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE 2531 // structure) then don't skip this PHI. 2532 if (isa<PHINode>(Iter) && isa<PHINode>(U) && 2533 U->getParent() != TheLoop->getHeader() && 2534 TheLoop->contains(U) && 2535 Iter->getNumUses() > 1) 2536 continue; 2537 2538 // We can't have multiple inside users. 2539 if (FoundInBlockUser) 2540 return false; 2541 FoundInBlockUser = true; 2542 2543 // Any reduction instr must be of one of the allowed kinds. 2544 if (!isReductionInstr(U, Kind)) 2545 return false; 2546 2547 // Reductions of instructions such as Div, and Sub is only 2548 // possible if the LHS is the reduction variable. 2549 if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter) 2550 return false; 2551 2552 Iter = U; 2553 } 2554 2555 // We found a reduction var if we have reached the original 2556 // phi node and we only have a single instruction with out-of-loop 2557 // users. 2558 if (FoundStartPHI) { 2559 // This instruction is allowed to have out-of-loop users. 2560 AllowedExit.insert(ExitInstruction); 2561 2562 // Save the description of this reduction variable. 2563 ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); 2564 Reductions[Phi] = RD; 2565 // We've ended the cycle. This is a reduction variable if we have an 2566 // outside user and it has a binary op. 2567 return FoundBinOp && ExitInstruction; 2568 } 2569 } 2570} 2571 2572bool 2573LoopVectorizationLegality::isReductionInstr(Instruction *I, 2574 ReductionKind Kind) { 2575 bool FP = I->getType()->isFloatingPointTy(); 2576 bool FastMath = (FP && I->isCommutative() && I->isAssociative()); 2577 2578 switch (I->getOpcode()) { 2579 default: 2580 return false; 2581 case Instruction::PHI: 2582 if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd)) 2583 return false; 2584 // possibly. 2585 return true; 2586 case Instruction::Sub: 2587 case Instruction::Add: 2588 return Kind == RK_IntegerAdd; 2589 case Instruction::SDiv: 2590 case Instruction::UDiv: 2591 case Instruction::Mul: 2592 return Kind == RK_IntegerMult; 2593 case Instruction::And: 2594 return Kind == RK_IntegerAnd; 2595 case Instruction::Or: 2596 return Kind == RK_IntegerOr; 2597 case Instruction::Xor: 2598 return Kind == RK_IntegerXor; 2599 case Instruction::FMul: 2600 return Kind == RK_FloatMult && FastMath; 2601 case Instruction::FAdd: 2602 return Kind == RK_FloatAdd && FastMath; 2603 } 2604} 2605 2606LoopVectorizationLegality::InductionKind 2607LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { 2608 Type *PhiTy = Phi->getType(); 2609 // We only handle integer and pointer inductions variables. 2610 if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) 2611 return IK_NoInduction; 2612 2613 // Check that the PHI is consecutive. 2614 const SCEV *PhiScev = SE->getSCEV(Phi); 2615 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 2616 if (!AR) { 2617 DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); 2618 return IK_NoInduction; 2619 } 2620 const SCEV *Step = AR->getStepRecurrence(*SE); 2621 2622 // Integer inductions need to have a stride of one. 2623 if (PhiTy->isIntegerTy()) { 2624 if (Step->isOne()) 2625 return IK_IntInduction; 2626 if (Step->isAllOnesValue()) 2627 return IK_ReverseIntInduction; 2628 return IK_NoInduction; 2629 } 2630 2631 // Calculate the pointer stride and check if it is consecutive. 2632 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); 2633 if (!C) 2634 return IK_NoInduction; 2635 2636 assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); 2637 uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); 2638 if (C->getValue()->equalsInt(Size)) 2639 return IK_PtrInduction; 2640 else if (C->getValue()->equalsInt(0 - Size)) 2641 return IK_ReversePtrInduction; 2642 2643 return IK_NoInduction; 2644} 2645 2646bool LoopVectorizationLegality::isInductionVariable(const Value *V) { 2647 Value *In0 = const_cast<Value*>(V); 2648 PHINode *PN = dyn_cast_or_null<PHINode>(In0); 2649 if (!PN) 2650 return false; 2651 2652 return Inductions.count(PN); 2653} 2654 2655bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { 2656 assert(TheLoop->contains(BB) && "Unknown block used"); 2657 2658 // Blocks that do not dominate the latch need predication. 2659 BasicBlock* Latch = TheLoop->getLoopLatch(); 2660 return !DT->dominates(BB, Latch); 2661} 2662 2663bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { 2664 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 2665 // We don't predicate loads/stores at the moment. 2666 if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) 2667 return false; 2668 2669 // The instructions below can trap. 2670 switch (it->getOpcode()) { 2671 default: continue; 2672 case Instruction::UDiv: 2673 case Instruction::SDiv: 2674 case Instruction::URem: 2675 case Instruction::SRem: 2676 return false; 2677 } 2678 } 2679 2680 return true; 2681} 2682 2683bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { 2684 const SCEV *PhiScev = SE->getSCEV(Ptr); 2685 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 2686 if (!AR) 2687 return false; 2688 2689 return AR->isAffine(); 2690} 2691 2692LoopVectorizationCostModel::VectorizationFactor 2693LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, 2694 unsigned UserVF) { 2695 // Width 1 means no vectorize 2696 VectorizationFactor Factor = { 1U, 0U }; 2697 if (OptForSize && Legal->getRuntimePointerCheck()->Need) { 2698 DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); 2699 return Factor; 2700 } 2701 2702 // Find the trip count. 2703 unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); 2704 DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); 2705 2706 unsigned WidestType = getWidestType(); 2707 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 2708 unsigned MaxVectorSize = WidestRegister / WidestType; 2709 DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); 2710 DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n"); 2711 2712 if (MaxVectorSize == 0) { 2713 DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 2714 MaxVectorSize = 1; 2715 } 2716 2717 assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements" 2718 " into one vector!"); 2719 2720 unsigned VF = MaxVectorSize; 2721 2722 // If we optimize the program for size, avoid creating the tail loop. 2723 if (OptForSize) { 2724 // If we are unable to calculate the trip count then don't try to vectorize. 2725 if (TC < 2) { 2726 DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); 2727 return Factor; 2728 } 2729 2730 // Find the maximum SIMD width that can fit within the trip count. 2731 VF = TC % MaxVectorSize; 2732 2733 if (VF == 0) 2734 VF = MaxVectorSize; 2735 2736 // If the trip count that we found modulo the vectorization factor is not 2737 // zero then we require a tail. 2738 if (VF < 2) { 2739 DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); 2740 return Factor; 2741 } 2742 } 2743 2744 if (UserVF != 0) { 2745 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 2746 DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n"); 2747 2748 Factor.Width = UserVF; 2749 return Factor; 2750 } 2751 2752 float Cost = expectedCost(1); 2753 unsigned Width = 1; 2754 DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); 2755 for (unsigned i=2; i <= VF; i*=2) { 2756 // Notice that the vector loop needs to be executed less times, so 2757 // we need to divide the cost of the vector loops by the width of 2758 // the vector elements. 2759 float VectorCost = expectedCost(i) / (float)i; 2760 DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << 2761 (int)VectorCost << ".\n"); 2762 if (VectorCost < Cost) { 2763 Cost = VectorCost; 2764 Width = i; 2765 } 2766 } 2767 2768 DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); 2769 Factor.Width = Width; 2770 Factor.Cost = Width * Cost; 2771 return Factor; 2772} 2773 2774unsigned LoopVectorizationCostModel::getWidestType() { 2775 unsigned MaxWidth = 8; 2776 2777 // For each block. 2778 for (Loop::block_iterator bb = TheLoop->block_begin(), 2779 be = TheLoop->block_end(); bb != be; ++bb) { 2780 BasicBlock *BB = *bb; 2781 2782 // For each instruction in the loop. 2783 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 2784 Type *T = it->getType(); 2785 2786 // Only examine Loads, Stores and PHINodes. 2787 if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) 2788 continue; 2789 2790 // Examine PHI nodes that are reduction variables. 2791 if (PHINode *PN = dyn_cast<PHINode>(it)) 2792 if (!Legal->getReductionVars()->count(PN)) 2793 continue; 2794 2795 // Examine the stored values. 2796 StoreInst *ST = 0; 2797 if ((ST = dyn_cast<StoreInst>(it))) 2798 T = ST->getValueOperand()->getType(); 2799 2800 // Ignore loaded pointer types and stored pointer types that are not 2801 // consecutive. However, we do want to take consecutive stores/loads of 2802 // pointer vectors into account. 2803 if (T->isPointerTy() && isConsecutiveLoadOrStore(it)) 2804 MaxWidth = std::max(MaxWidth, DL->getPointerSizeInBits()); 2805 else 2806 MaxWidth = std::max(MaxWidth, T->getScalarSizeInBits()); 2807 } 2808 } 2809 2810 return MaxWidth; 2811} 2812 2813unsigned 2814LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, 2815 unsigned UserUF, 2816 unsigned VF, 2817 unsigned LoopCost) { 2818 2819 // -- The unroll heuristics -- 2820 // We unroll the loop in order to expose ILP and reduce the loop overhead. 2821 // There are many micro-architectural considerations that we can't predict 2822 // at this level. For example frontend pressure (on decode or fetch) due to 2823 // code size, or the number and capabilities of the execution ports. 2824 // 2825 // We use the following heuristics to select the unroll factor: 2826 // 1. If the code has reductions the we unroll in order to break the cross 2827 // iteration dependency. 2828 // 2. If the loop is really small then we unroll in order to reduce the loop 2829 // overhead. 2830 // 3. We don't unroll if we think that we will spill registers to memory due 2831 // to the increased register pressure. 2832 2833 // Use the user preference, unless 'auto' is selected. 2834 if (UserUF != 0) 2835 return UserUF; 2836 2837 // When we optimize for size we don't unroll. 2838 if (OptForSize) 2839 return 1; 2840 2841 // Do not unroll loops with a relatively small trip count. 2842 unsigned TC = SE->getSmallConstantTripCount(TheLoop, 2843 TheLoop->getLoopLatch()); 2844 if (TC > 1 && TC < TinyTripCountUnrollThreshold) 2845 return 1; 2846 2847 unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true); 2848 DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters << 2849 " vector registers\n"); 2850 2851 LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); 2852 // We divide by these constants so assume that we have at least one 2853 // instruction that uses at least one register. 2854 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); 2855 R.NumInstructions = std::max(R.NumInstructions, 1U); 2856 2857 // We calculate the unroll factor using the following formula. 2858 // Subtract the number of loop invariants from the number of available 2859 // registers. These registers are used by all of the unrolled instances. 2860 // Next, divide the remaining registers by the number of registers that is 2861 // required by the loop, in order to estimate how many parallel instances 2862 // fit without causing spills. 2863 unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers; 2864 2865 // Clamp the unroll factor ranges to reasonable factors. 2866 unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor(); 2867 2868 // If we did not calculate the cost for VF (because the user selected the VF) 2869 // then we calculate the cost of VF here. 2870 if (LoopCost == 0) 2871 LoopCost = expectedCost(VF); 2872 2873 // Clamp the calculated UF to be between the 1 and the max unroll factor 2874 // that the target allows. 2875 if (UF > MaxUnrollSize) 2876 UF = MaxUnrollSize; 2877 else if (UF < 1) 2878 UF = 1; 2879 2880 if (Legal->getReductionVars()->size()) { 2881 DEBUG(dbgs() << "LV: Unrolling because of reductions. \n"); 2882 return UF; 2883 } 2884 2885 // We want to unroll tiny loops in order to reduce the loop overhead. 2886 // We assume that the cost overhead is 1 and we use the cost model 2887 // to estimate the cost of the loop and unroll until the cost of the 2888 // loop overhead is about 5% of the cost of the loop. 2889 DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n"); 2890 if (LoopCost < 20) { 2891 DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n"); 2892 unsigned NewUF = 20/LoopCost + 1; 2893 return std::min(NewUF, UF); 2894 } 2895 2896 DEBUG(dbgs() << "LV: Not Unrolling. \n"); 2897 return 1; 2898} 2899 2900LoopVectorizationCostModel::RegisterUsage 2901LoopVectorizationCostModel::calculateRegisterUsage() { 2902 // This function calculates the register usage by measuring the highest number 2903 // of values that are alive at a single location. Obviously, this is a very 2904 // rough estimation. We scan the loop in a topological order in order and 2905 // assign a number to each instruction. We use RPO to ensure that defs are 2906 // met before their users. We assume that each instruction that has in-loop 2907 // users starts an interval. We record every time that an in-loop value is 2908 // used, so we have a list of the first and last occurrences of each 2909 // instruction. Next, we transpose this data structure into a multi map that 2910 // holds the list of intervals that *end* at a specific location. This multi 2911 // map allows us to perform a linear search. We scan the instructions linearly 2912 // and record each time that a new interval starts, by placing it in a set. 2913 // If we find this value in the multi-map then we remove it from the set. 2914 // The max register usage is the maximum size of the set. 2915 // We also search for instructions that are defined outside the loop, but are 2916 // used inside the loop. We need this number separately from the max-interval 2917 // usage number because when we unroll, loop-invariant values do not take 2918 // more register. 2919 LoopBlocksDFS DFS(TheLoop); 2920 DFS.perform(LI); 2921 2922 RegisterUsage R; 2923 R.NumInstructions = 0; 2924 2925 // Each 'key' in the map opens a new interval. The values 2926 // of the map are the index of the 'last seen' usage of the 2927 // instruction that is the key. 2928 typedef DenseMap<Instruction*, unsigned> IntervalMap; 2929 // Maps instruction to its index. 2930 DenseMap<unsigned, Instruction*> IdxToInstr; 2931 // Marks the end of each interval. 2932 IntervalMap EndPoint; 2933 // Saves the list of instruction indices that are used in the loop. 2934 SmallSet<Instruction*, 8> Ends; 2935 // Saves the list of values that are used in the loop but are 2936 // defined outside the loop, such as arguments and constants. 2937 SmallPtrSet<Value*, 8> LoopInvariants; 2938 2939 unsigned Index = 0; 2940 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 2941 be = DFS.endRPO(); bb != be; ++bb) { 2942 R.NumInstructions += (*bb)->size(); 2943 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 2944 ++it) { 2945 Instruction *I = it; 2946 IdxToInstr[Index++] = I; 2947 2948 // Save the end location of each USE. 2949 for (unsigned i = 0; i < I->getNumOperands(); ++i) { 2950 Value *U = I->getOperand(i); 2951 Instruction *Instr = dyn_cast<Instruction>(U); 2952 2953 // Ignore non-instruction values such as arguments, constants, etc. 2954 if (!Instr) continue; 2955 2956 // If this instruction is outside the loop then record it and continue. 2957 if (!TheLoop->contains(Instr)) { 2958 LoopInvariants.insert(Instr); 2959 continue; 2960 } 2961 2962 // Overwrite previous end points. 2963 EndPoint[Instr] = Index; 2964 Ends.insert(Instr); 2965 } 2966 } 2967 } 2968 2969 // Saves the list of intervals that end with the index in 'key'. 2970 typedef SmallVector<Instruction*, 2> InstrList; 2971 DenseMap<unsigned, InstrList> TransposeEnds; 2972 2973 // Transpose the EndPoints to a list of values that end at each index. 2974 for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); 2975 it != e; ++it) 2976 TransposeEnds[it->second].push_back(it->first); 2977 2978 SmallSet<Instruction*, 8> OpenIntervals; 2979 unsigned MaxUsage = 0; 2980 2981 2982 DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 2983 for (unsigned int i = 0; i < Index; ++i) { 2984 Instruction *I = IdxToInstr[i]; 2985 // Ignore instructions that are never used within the loop. 2986 if (!Ends.count(I)) continue; 2987 2988 // Remove all of the instructions that end at this location. 2989 InstrList &List = TransposeEnds[i]; 2990 for (unsigned int j=0, e = List.size(); j < e; ++j) 2991 OpenIntervals.erase(List[j]); 2992 2993 // Count the number of live interals. 2994 MaxUsage = std::max(MaxUsage, OpenIntervals.size()); 2995 2996 DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << 2997 OpenIntervals.size() <<"\n"); 2998 2999 // Add the current instruction to the list of open intervals. 3000 OpenIntervals.insert(I); 3001 } 3002 3003 unsigned Invariant = LoopInvariants.size(); 3004 DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n"); 3005 DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n"); 3006 DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n"); 3007 3008 R.LoopInvariantRegs = Invariant; 3009 R.MaxLocalUsers = MaxUsage; 3010 return R; 3011} 3012 3013unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 3014 unsigned Cost = 0; 3015 3016 // For each block. 3017 for (Loop::block_iterator bb = TheLoop->block_begin(), 3018 be = TheLoop->block_end(); bb != be; ++bb) { 3019 unsigned BlockCost = 0; 3020 BasicBlock *BB = *bb; 3021 3022 // For each instruction in the old loop. 3023 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 3024 unsigned C = getInstructionCost(it, VF); 3025 Cost += C; 3026 DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << 3027 VF << " For instruction: "<< *it << "\n"); 3028 } 3029 3030 // We assume that if-converted blocks have a 50% chance of being executed. 3031 // When the code is scalar then some of the blocks are avoided due to CF. 3032 // When the code is vectorized we execute all code paths. 3033 if (Legal->blockNeedsPredication(*bb) && VF == 1) 3034 BlockCost /= 2; 3035 3036 Cost += BlockCost; 3037 } 3038 3039 return Cost; 3040} 3041 3042unsigned 3043LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 3044 // If we know that this instruction will remain uniform, check the cost of 3045 // the scalar version. 3046 if (Legal->isUniformAfterVectorization(I)) 3047 VF = 1; 3048 3049 Type *RetTy = I->getType(); 3050 Type *VectorTy = ToVectorTy(RetTy, VF); 3051 3052 // TODO: We need to estimate the cost of intrinsic calls. 3053 switch (I->getOpcode()) { 3054 case Instruction::GetElementPtr: 3055 // We mark this instruction as zero-cost because scalar GEPs are usually 3056 // lowered to the intruction addressing mode. At the moment we don't 3057 // generate vector geps. 3058 return 0; 3059 case Instruction::Br: { 3060 return TTI.getCFInstrCost(I->getOpcode()); 3061 } 3062 case Instruction::PHI: 3063 //TODO: IF-converted IFs become selects. 3064 return 0; 3065 case Instruction::Add: 3066 case Instruction::FAdd: 3067 case Instruction::Sub: 3068 case Instruction::FSub: 3069 case Instruction::Mul: 3070 case Instruction::FMul: 3071 case Instruction::UDiv: 3072 case Instruction::SDiv: 3073 case Instruction::FDiv: 3074 case Instruction::URem: 3075 case Instruction::SRem: 3076 case Instruction::FRem: 3077 case Instruction::Shl: 3078 case Instruction::LShr: 3079 case Instruction::AShr: 3080 case Instruction::And: 3081 case Instruction::Or: 3082 case Instruction::Xor: 3083 return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); 3084 case Instruction::Select: { 3085 SelectInst *SI = cast<SelectInst>(I); 3086 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 3087 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 3088 Type *CondTy = SI->getCondition()->getType(); 3089 if (ScalarCond) 3090 CondTy = VectorType::get(CondTy, VF); 3091 3092 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 3093 } 3094 case Instruction::ICmp: 3095 case Instruction::FCmp: { 3096 Type *ValTy = I->getOperand(0)->getType(); 3097 VectorTy = ToVectorTy(ValTy, VF); 3098 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); 3099 } 3100 case Instruction::Store: 3101 case Instruction::Load: { 3102 StoreInst *SI = dyn_cast<StoreInst>(I); 3103 LoadInst *LI = dyn_cast<LoadInst>(I); 3104 Type *ValTy = (SI ? SI->getValueOperand()->getType() : 3105 LI->getType()); 3106 VectorTy = ToVectorTy(ValTy, VF); 3107 3108 unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); 3109 unsigned AS = SI ? SI->getPointerAddressSpace() : 3110 LI->getPointerAddressSpace(); 3111 Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand(); 3112 3113 if (VF == 1) 3114 return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 3115 3116 // Scalarized loads/stores. 3117 int Stride = Legal->isConsecutivePtr(Ptr); 3118 bool Reverse = Stride < 0; 3119 if (0 == Stride) { 3120 unsigned Cost = 0; 3121 // The cost of extracting from the value vector and pointer vector. 3122 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 3123 for (unsigned i = 0; i < VF; ++i) { 3124 // The cost of extracting the pointer operand. 3125 Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); 3126 // In case of STORE, the cost of ExtractElement from the vector. 3127 // In case of LOAD, the cost of InsertElement into the returned 3128 // vector. 3129 Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : 3130 Instruction::InsertElement, 3131 VectorTy, i); 3132 } 3133 3134 // The cost of the scalar stores. 3135 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 3136 Alignment, AS); 3137 return Cost; 3138 } 3139 3140 // Wide load/stores. 3141 unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, 3142 Alignment, AS); 3143 if (Reverse) 3144 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 3145 VectorTy, 0); 3146 return Cost; 3147 } 3148 case Instruction::ZExt: 3149 case Instruction::SExt: 3150 case Instruction::FPToUI: 3151 case Instruction::FPToSI: 3152 case Instruction::FPExt: 3153 case Instruction::PtrToInt: 3154 case Instruction::IntToPtr: 3155 case Instruction::SIToFP: 3156 case Instruction::UIToFP: 3157 case Instruction::Trunc: 3158 case Instruction::FPTrunc: 3159 case Instruction::BitCast: { 3160 // We optimize the truncation of induction variable. 3161 // The cost of these is the same as the scalar operation. 3162 if (I->getOpcode() == Instruction::Trunc && 3163 Legal->isInductionVariable(I->getOperand(0))) 3164 return TTI.getCastInstrCost(I->getOpcode(), I->getType(), 3165 I->getOperand(0)->getType()); 3166 3167 Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); 3168 return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 3169 } 3170 case Instruction::Call: { 3171 assert(isTriviallyVectorizableIntrinsic(I)); 3172 IntrinsicInst *II = cast<IntrinsicInst>(I); 3173 Type *RetTy = ToVectorTy(II->getType(), VF); 3174 SmallVector<Type*, 4> Tys; 3175 for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) 3176 Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); 3177 return TTI.getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); 3178 } 3179 default: { 3180 // We are scalarizing the instruction. Return the cost of the scalar 3181 // instruction, plus the cost of insert and extract into vector 3182 // elements, times the vector width. 3183 unsigned Cost = 0; 3184 3185 if (!RetTy->isVoidTy() && VF != 1) { 3186 unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, 3187 VectorTy); 3188 unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, 3189 VectorTy); 3190 3191 // The cost of inserting the results plus extracting each one of the 3192 // operands. 3193 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 3194 } 3195 3196 // The cost of executing VF copies of the scalar instruction. This opcode 3197 // is unknown. Assume that it is the same as 'mul'. 3198 Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); 3199 return Cost; 3200 } 3201 }// end of switch. 3202} 3203 3204Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 3205 if (Scalar->isVoidTy() || VF == 1) 3206 return Scalar; 3207 return VectorType::get(Scalar, VF); 3208} 3209 3210char LoopVectorize::ID = 0; 3211static const char lv_name[] = "Loop Vectorization"; 3212INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 3213INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 3214INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 3215INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 3216INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 3217INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 3218 3219namespace llvm { 3220 Pass *createLoopVectorizePass() { 3221 return new LoopVectorize(); 3222 } 3223} 3224 3225bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 3226 // Check for a store. 3227 StoreInst *ST = dyn_cast<StoreInst>(Inst); 3228 if (ST) 3229 return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; 3230 3231 // Check for a load. 3232 LoadInst *LI = dyn_cast<LoadInst>(Inst); 3233 if (LI) 3234 return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; 3235 3236 return false; 3237} 3238