LoopVectorize.cpp revision 0aad08adfda3ff7c4f9c453ba821057b5436c4ef
//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
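//
// As a rough illustration of this transformation (a hand-written sketch, not
// code produced by this pass), with a vectorization factor of 4 the scalar
// loop
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// becomes a wide loop that steps the index by 4, plus a scalar epilogue for
// the remaining n % 4 iterations:
//
//   for (i = 0; i < n - n % 4; i += 4)
//     a[i:i+3] = b[i:i+3] + <42, 42, 42, 42>;  // one wide load/add/store
//   for (; i < n; ++i)                         // scalar remainder
//     a[i] = b[i] + 42;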
//
//===----------------------------------------------------------------------===//

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <map>

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                    cl::desc("Sets the SIMD width. Zero is autoselect."));

static cl::opt<unsigned>
VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
                    cl::desc("Sets the vectorization unroll count. "
                             "Zero is autoselect."));

static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                   cl::desc("Enable if-conversion during vectorization."));

/// We don't vectorize loops with a known constant trip count below this
/// number.
static cl::opt<unsigned>
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                             cl::Hidden,
                             cl::desc("Don't vectorize loops with a constant "
                                      "trip count that is smaller than this "
                                      "value."));

/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;

/// When performing memory disambiguation checks at runtime do not make more
/// than this number of comparisons.
static const unsigned RuntimeMemoryCheckThreshold = 8;

/// We use a metadata with this name to indicate that a scalar loop was
/// vectorized and that we don't need to re-vectorize it if we run into it
/// again.
static const char*
AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized";

namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars.
/// This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found in the loop.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                      DominatorTree *DT, DataLayout *DL,
                      const TargetLibraryInfo *TLI, unsigned VecWidth,
                      unsigned UnrollFactor)
      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
        OldInduction(0), WidenMap(UnrollFactor) {}

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *Legal) {
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop(Legal);
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop(Legal);
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }

private:
  /// A small list of PHINodes.
  typedef SmallVector<PHINode*, 4> PhiVector;
  /// When we unroll loops we have multiple vector values for each scalar.
  /// This data structure holds the unrolled and vectorized values that
  /// originated from one scalar instruction.
  typedef SmallVector<Value*, 2> VectorParts;

  /// Add code that checks at runtime if the accessed arrays overlap.
  /// Returns the comparator value or NULL if no check is needed.
  Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
                               Instruction *Loc);
  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop(LoopVectorizationLegality *Legal);
  /// Copy and widen the instructions from the old loop.
  void vectorizeLoop(LoopVectorizationLegality *Legal);

  /// A helper function that computes the predicate of the block BB, assuming
  /// that the header block of the loop is set to True. It returns the *entry*
  /// mask for the block BB.
  VectorParts createBlockInMask(BasicBlock *BB);
  /// A helper function that computes the predicate of the edge between SRC
  /// and DST.
  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);

  /// A helper function to vectorize a single BB within the innermost loop.
  void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
                            PhiVector *PV);

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars.
  void scalarizeInstruction(Instruction *Instr);

  /// Vectorize Load and Store instructions.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  LoopVectorizationLegality *Legal);

  /// Create a broadcast instruction.
  /// This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  Value *getBroadcastInstrs(Value *V);

  /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
  /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
  /// The sequence starts at StartIdx.
  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  VectorParts &getVectorValue(Value *V);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  Value *reverseVector(Value *Vec);

  /// This is a helper class that holds the vectorizer state. It maps scalar
  /// instructions to vector instructions. When the code is 'unrolled' then
  /// a single scalar value is mapped to multiple vector parts. The parts
  /// are stored in the VectorParts type.
  struct ValueMap {
    /// C'tor. UnrollFactor controls the number of vectors ('parts') that
    /// are mapped.
    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

    /// \return True if 'Key' is saved in the Value Map.
    bool has(Value *Key) const { return MapStorage.count(Key); }

    /// Initializes a new entry in the map. Sets all of the vector parts to the
    /// same value in 'Val'.
    /// \return A reference to a vector with splat values.
    VectorParts &splat(Value *Key, Value *Val) {
      VectorParts &Entry = MapStorage[Key];
      Entry.assign(UF, Val);
      return Entry;
    }

    ///\return A reference to the value that is stored at 'Key'.
    VectorParts &get(Value *Key) {
      VectorParts &Entry = MapStorage[Key];
      if (Entry.empty())
        Entry.resize(UF);
      assert(Entry.size() == UF);
      return Entry;
    }

  private:
    /// The unroll factor. Each entry in the map stores this number of vector
    /// elements.
    unsigned UF;

    /// Map storage. We use std::map and not DenseMap because insertions to a
    /// dense map invalidate its iterators.
    std::map<Value *, VectorParts> MapStorage;
  };

  /// The original loop.
  Loop *OrigLoop;
  /// Scev analysis to use.
  ScalarEvolution *SE;
  /// Loop Info.
  LoopInfo *LI;
  /// Dominator Tree.
  DominatorTree *DT;
  /// Data Layout.
  DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;
  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  BasicBlock *LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  /// Holds the extended (to the widest induction type) start index.
  Value *ExtendedIdx;
  /// Maps scalars to widened vectors.
  ValueMap WidenMap;
};

/// \brief Check if conditionally executed loads are hoistable.
///
/// This class has two functions: isHoistableLoad and canHoistAllLoads.
/// isHoistableLoad should be called on all load instructions that are executed
/// conditionally. After all conditional loads are processed, the client should
/// call canHoistAllLoads to determine if all of the conditionally executed
/// loads have an unconditional memory access to the same memory address in the
/// loop.
class LoadHoisting {
  typedef SmallPtrSet<Value *, 8> MemorySet;

  Loop *TheLoop;
  DominatorTree *DT;
  MemorySet CondLoadAddrSet;

public:
  LoadHoisting(Loop *L, DominatorTree *D) : TheLoop(L), DT(D) {}

  /// \brief Check if the instruction is a load with an identifiable address.
  bool isHoistableLoad(Instruction *L);

  /// \brief Check if all of the conditional loads are hoistable because there
  /// exists an unconditional memory access to the same address in the loop.
  bool canHoistAllLoads();
};

bool LoadHoisting::isHoistableLoad(Instruction *L) {
  LoadInst *LI = dyn_cast<LoadInst>(L);
  if (!LI)
    return false;

  CondLoadAddrSet.insert(LI->getPointerOperand());
  return true;
}

static void addMemAccesses(BasicBlock *BB, SmallPtrSet<Value *, 8> &Set) {
  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
    Instruction *I = &*BI;
    Value *Addr = 0;

    // Try a load.
    LoadInst *LI = dyn_cast<LoadInst>(I);
    if (LI) {
      Addr = LI->getPointerOperand();
      Set.insert(Addr);
      continue;
    }

    // Try a store.
    StoreInst *SI = dyn_cast<StoreInst>(I);
    if (!SI)
      continue;

    Addr = SI->getPointerOperand();
    Set.insert(Addr);
  }
}

bool LoadHoisting::canHoistAllLoads() {
  // No conditional loads.
  if (CondLoadAddrSet.empty())
    return true;

  MemorySet UncondMemAccesses;
  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
  BasicBlock *LoopLatch = TheLoop->getLoopLatch();

  // Iterate over the unconditional blocks and collect memory access addresses.
  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
    BasicBlock *BB = LoopBlocks[i];

    // Ignore conditional blocks.
    if (BB != LoopLatch && !DT->dominates(BB, LoopLatch))
      continue;

    addMemAccesses(BB, UncondMemAccesses);
  }

  // And make sure there is a matching unconditional access for every
  // conditional load.
  for (MemorySet::iterator MI = CondLoadAddrSet.begin(),
       ME = CondLoadAddrSet.end(); MI != ME; ++MI)
    if (!UncondMemAccesses.count(*MI))
      return false;

  return true;
}

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
///   checks for a number of different conditions, such as the availability of
///   a single induction variable, that all types are supported and
///   vectorizable, etc. This code reflects the capabilities of
///   InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying the
/// induction variables and the different reduction variables.
class LoopVectorizationLegality {
public:
  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
                            DominatorTree *DT, TargetTransformInfo* TTI,
                            AliasAnalysis *AA, TargetLibraryInfo *TLI)
      : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
        Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
        LoadSpeculation(L, DT) {}

  /// This enum represents the kinds of reductions that we support.
  enum ReductionKind {
    RK_NoReduction,   ///< Not a reduction.
    RK_IntegerAdd,    ///< Sum of integers.
    RK_IntegerMult,   ///< Product of integers.
    RK_IntegerOr,     ///< Bitwise or logical OR of numbers.
    RK_IntegerAnd,    ///< Bitwise or logical AND of numbers.
    RK_IntegerXor,    ///< Bitwise or logical XOR of numbers.
    RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
    RK_FloatAdd,      ///< Sum of floats.
    RK_FloatMult,     ///< Product of floats.
    RK_FloatMinMax    ///< Min/max implemented in terms of select(cmp()).
  };

  /// This enum represents the kinds of inductions that we support.
  enum InductionKind {
    IK_NoInduction,         ///< Not an induction variable.
    IK_IntInduction,        ///< Integer induction variable. Step = 1.
    IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
    IK_PtrInduction,        ///< Pointer induction var. Step = sizeof(elem).
    IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = - sizeof(elem).
  };

  // This enum represents the kind of minmax reduction.
  enum MinMaxReductionKind {
    MRK_Invalid,
    MRK_UIntMin,
    MRK_UIntMax,
    MRK_SIntMin,
    MRK_SIntMax,
    MRK_FloatMin,
    MRK_FloatMax
  };

  /// This POD struct holds information about reduction variables.
  struct ReductionDescriptor {
    ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
      Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}

    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
                        MinMaxReductionKind MK)
        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    Value *StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
    // If this is a min/max reduction, the kind of reduction.
    MinMaxReductionKind MinMaxKind;
  };

  /// This POD struct holds information about a potential reduction operation.
  struct ReductionInstDesc {
    ReductionInstDesc(bool IsRedux, Instruction *I) :
      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}

    ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
      IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}

    // Is this instruction a reduction candidate?
    bool IsReduction;
    // The last instruction in a min/max pattern (select of the select(icmp())
    // pattern), or the current reduction instruction otherwise.
    Instruction *PatternLastInst;
    // If this is a min/max pattern, the comparison predicate.
    MinMaxReductionKind MinMaxKind;
  };

  // This POD struct holds information about the memory runtime legality
  // check that a group of pointers do not overlap.
  struct RuntimePointerCheck {
    RuntimePointerCheck() : Need(false) {}

    /// Reset the state of the pointer runtime information.
    void reset() {
      Need = false;
      Pointers.clear();
      Starts.clear();
      Ends.clear();
    }

    /// Insert a pointer and calculate the start and end SCEVs.
    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);

    /// This flag indicates if we need to add the runtime check.
    bool Need;
    /// Holds the pointers that we need to check.
    SmallVector<Value*, 2> Pointers;
    /// Holds the pointer value at the beginning of the loop.
    SmallVector<const SCEV*, 2> Starts;
    /// Holds the pointer value at the end of the loop.
    SmallVector<const SCEV*, 2> Ends;
    /// Holds a flag indicating whether this pointer is used for writing to
    /// memory.
    SmallVector<bool, 2> IsWritePtr;
  };

  /// A POD for saving information about induction variables.
  struct InductionInfo {
    InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
    InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
    /// Start value.
    Value *StartValue;
    /// Induction kind.
    InductionKind IK;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode*, InductionInfo> InductionList;

  /// Alias(Multi)Map stores the values (GEPs or underlying objects) and their
  /// respective Store/Load instruction(s) to calculate aliasing.
  typedef MapVector<Value*, Instruction* > AliasMap;
  typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Returns the widest induction type.
  Type *getWidestInductionType() { return WidestIndTy; }

  /// Returns True if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or when the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  /// 0 - Stride is unknown or non-consecutive.
  /// 1 - Address is consecutive.
  /// -1 - Address is consecutive, and decreasing.
  int isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }

  /// Returns the information that we collected about runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }

  /// This function returns the identity element (or neutral element) for
  /// the operation K.
  static Constant *getReductionIdentity(ReductionKind K, Type *Tp);

private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed.
  bool blockCanBePredicated(BasicBlock *BB);

  /// Returns True if 'Phi' is the kind of reduction variable for type
  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns a struct describing if the instruction 'I' can be a reduction
  /// variable of type 'Kind'. If the reduction is a min/max pattern of
  /// select(icmp()) this function advances the instruction pointer 'I' from
  /// the compare instruction to the select instruction and stores this pointer
  /// in the 'PatternLastInst' member of the returned struct.
  ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
                                     ReductionInstDesc &Desc);
  /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
  /// pattern corresponding to a min(X, Y) or max(X, Y).
  static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev);
  /// Returns the induction kind of Phi. This function may return NoInduction
  /// if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi);
  /// Return true if we can compute the address bounds of Ptr within the loop.
  bool hasComputableBounds(Value *Ptr);
  /// Return true if there is a chance of write reorder.
  bool hasPossibleGlobalWriteReorder(Value *Object,
                                     Instruction *Inst,
                                     AliasMultiMap &WriteObjects,
                                     unsigned MaxByteWidth);
  /// Return the AA location for a load or a store.
  AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  DataLayout *DL;
  /// Dominators.
  DominatorTree *DT;
  /// Target Info.
  TargetTransformInfo *TTI;
  /// Alias Analysis.
  AliasAnalysis *AA;
  /// Target Library Info.
  TargetLibraryInfo *TLI;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;
  /// Holds the widest induction type encountered.
  Type *WidestIndTy;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

  /// Utility to determine whether loads can be speculated.
  LoadHoisting LoadSpeculation;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             DataLayout *DL, const TargetLibraryInfo *TLI)
      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}

  /// Information about vectorization costs.
  struct VectorizationFactor {
    unsigned Width; // Vector width with best cost.
    unsigned Cost;  // Cost of the loop with that width.
  };

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(bool OptForSize,
                                                unsigned UserVF);

  /// \return The size (in bits) of the widest type in the code that
  /// needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  unsigned getWidestType();

  /// \return The most profitable unroll factor.
  /// If UserUF is non-zero then this method finds the best unroll-factor
  /// based on register pressure and other parameters.
  /// VF and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
                              unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return information about the register usage of the loop.
  RegisterUsage calculateRegisterUsage();

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
  /// Target data layout information.
  DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
};

/// The LoopVectorize Pass.
struct LoopVectorize : public LoopPass {
  /// Pass identification, replacement for typeid.
  static char ID;

  explicit LoopVectorize() : LoopPass(ID) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  DataLayout *DL;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;
  AliasAnalysis *AA;
  TargetLibraryInfo *TLI;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
    // We only vectorize innermost loops.
    if (!L->empty())
      return false;

    SE = &getAnalysis<ScalarEvolution>();
    DL = getAnalysisIfAvailable<DataLayout>();
    LI = &getAnalysis<LoopInfo>();
    TTI = &getAnalysis<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();
    AA = getAnalysisIfAvailable<AliasAnalysis>();
    TLI = getAnalysisIfAvailable<TargetLibraryInfo>();

    if (DL == NULL) {
      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n");
      return false;
    }

    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
          L->getHeader()->getParent()->getName() << "\"\n");

    // Check if it is legal to vectorize the loop.
    LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Use the cost model.
    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);

    // Check the function attributes to find out if this function should be
    // optimized for size.
    Function *F = L->getHeader()->getParent();
    Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
    Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
    unsigned FnIndex = AttributeSet::FunctionIndex;
    bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
    bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);

    if (NoFloat) {
      DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat "
            "attribute is used.\n");
      return false;
    }

    // Select the optimal vectorization factor.
    LoopVectorizationCostModel::VectorizationFactor VF;
    VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
    // Select the unroll factor.
    unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
                                        VF.Width, VF.Cost);

    if (VF.Width == 1) {
      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
      return false;
    }

    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " <<
          F->getParent()->getModuleIdentifier() << "\n");
    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");

    // If we decided that it is *legal* to vectorize the loop then do it.
    InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
    LB.vectorize(&LVL);

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    LoopPass::getAnalysisUsage(AU);
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<DominatorTree>();
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<TargetTransformInfo>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
  }

};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel.
//===----------------------------------------------------------------------===//

void
LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
                                                       Loop *Lp, Value *Ptr,
                                                       bool WritePtr) {
  const SCEV *Sc = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
  assert(AR && "Invalid addrec expression");
  const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
  Pointers.push_back(Ptr);
  Starts.push_back(AR->getStart());
  Ends.push_back(ScEnd);
  IsWritePtr.push_back(WritePtr);
}

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // Save the current insertion location.
  Instruction *Loc = Builder.GetInsertPoint();

  // We need to place the broadcast of invariant variables outside the loop.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;

  // Place the code for broadcasting invariant variables in the new preheader.
  if (Invariant)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  // Restore the builder insertion point.
  if (Invariant)
    Builder.SetInsertPoint(Loc);

  return Shuf;
}

Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
                                                 bool Negate) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  // Create the types.
  Type *ITy = Val->getType()->getScalarType();
  VectorType *Ty = cast<VectorType>(Val->getType());
  int VLen = Ty->getNumElements();
  SmallVector<Constant*, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i) {
    int64_t Idx = Negate ? (-i) : i;
    Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
  }

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  return Builder.CreateAdd(Val, Cv, "induction");
}

int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
  // Make sure that the pointer does not point to structs.
  if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
    return 0;

  // If this value is a pointer induction variable we know it is consecutive.
  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  if (Phi && Inductions.count(Phi)) {
    InductionInfo II = Inductions[Phi];
    if (IK_PtrInduction == II.IK)
      return 1;
    else if (IK_ReversePtrInduction == II.IK)
      return -1;
  }

  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return 0;

  unsigned NumOperands = Gep->getNumOperands();
  Value *LastIndex = Gep->getOperand(NumOperands - 1);

  Value *GpPtr = Gep->getPointerOperand();
  // If this GEP value is a consecutive pointer induction variable and all of
  // the indices are constant then we know it is consecutive.
  Phi = dyn_cast<PHINode>(GpPtr);
  if (Phi && Inductions.count(Phi)) {

    // Make sure that the pointer does not point to structs.
    PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
    if (GepPtrType->getElementType()->isAggregateType())
      return 0;

    // Make sure that all of the index operands are loop invariant.
    for (unsigned i = 1; i < NumOperands; ++i)
      if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
        return 0;

    InductionInfo II = Inductions[Phi];
    if (IK_PtrInduction == II.IK)
      return 1;
    else if (IK_ReversePtrInduction == II.IK)
      return -1;
  }

  // Check that all of the gep indices are uniform except for the last.
  for (unsigned i = 0; i < NumOperands - 1; ++i)
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
      return 0;

  // We can emit wide load/stores only if the last index is the induction
  // variable.
  const SCEV *Last = SE->getSCEV(LastIndex);
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // The memory is consecutive because the last index is consecutive
    // and all other indices are loop invariant.
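    // For instance (an illustrative note): with i as the loop counter, a GEP
    // like &A[i] has step +1 and is reported as consecutive (1), while
    // &A[n - i] has step -1 and is reported as reverse-consecutive (-1).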
    if (Step->isOne())
      return 1;
    if (Step->isAllOnesValue())
      return -1;
  }

  return 0;
}

bool LoopVectorizationLegality::isUniform(Value *V) {
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}

InnerLoopVectorizer::VectorParts&
InnerLoopVectorizer::getVectorValue(Value *V) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");

  // If we have this scalar in the map, return it.
  if (WidenMap.has(V))
    return WidenMap.get(V);

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  return WidenMap.splat(V, B);
}

Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  SmallVector<Constant*, 8> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "reverse");
}

void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                             LoopVectorizationLegality *Legal) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();

  unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
  unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;

  if (ScalarAllocatedSize != VectorElementSize)
    return scalarizeInstruction(Instr);

  // If the pointer is loop invariant or if it is non-consecutive,
  // scalarize the load.
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  bool Reverse = ConsecutiveStride < 0;
  bool UniformLoad = LI && Legal->isUniform(Ptr);
  if (!ConsecutiveStride || UniformLoad)
    return scalarizeInstruction(Instr);

  Constant *Zero = Builder.getInt32(0);
  VectorParts &Entry = WidenMap.get(Instr);

  // Handle consecutive loads/stores.
  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
    Value *PtrOperand = Gep->getPointerOperand();
    Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
    FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);

    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
    Gep2->setOperand(0, FirstBasePtr);
    Gep2->setName("gep.indvar.base");
    Ptr = Builder.Insert(Gep2);
  } else if (Gep) {
    assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
                               OrigLoop) && "Base ptr must be invariant");

    // The last index does not have to be the induction. It can be
    // consecutive and be a function of the index.
    // For example A[I+1];
    unsigned NumOperands = Gep->getNumOperands();

    Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
    VectorParts &GEPParts = getVectorValue(LastGepOperand);
    Value *LastIndex = GEPParts[0];
    LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
    Gep2->setOperand(NumOperands - 1, LastIndex);
    Gep2->setName("gep.indvar.idx");
    Ptr = Builder.Insert(Gep2);
  } else {
    // Use the induction element ptr.
    assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
    VectorParts &PtrVal = getVectorValue(Ptr);
    Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
  }

  // Handle Stores:
  if (SI) {
    assert(!Legal->isUniform(SI->getPointerOperand()) &&
           "We do not allow storing to uniform addresses");

    VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
    for (unsigned Part = 0; Part < UF; ++Part) {
      // Calculate the pointer for the specific unroll-part.
      Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));

      if (Reverse) {
        // If we store to reverse consecutive memory locations then we need
        // to reverse the order of elements in the stored value.
        StoredVal[Part] = reverseVector(StoredVal[Part]);
        // If the address is consecutive but reversed, then the
        // wide store needs to start at the last vector element.
        PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
        PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
      }

      Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
      Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
    }
    // The store is complete; don't fall through to the load path below.
    return;
  }

  // Handle loads:
  for (unsigned Part = 0; Part < UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load needs to start at the last vector element.
      PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
      PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
    }

    Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
    Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
    cast<LoadInst>(LI)->setAlignment(Alignment);
    Entry[Part] = Reverse ? reverseVector(LI) : LI;
  }
}

void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<VectorParts, 4> Params;

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    if (SrcInst && OrigLoop->contains(SrcInst)) {
      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap.get(SrcInst));
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a
      // constant.
      VectorParts Scalars;
      Scalars.append(UF, SrcOp);
      Params.push_back(Scalars);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Value *UndefVec = IsVoidRetTy ? 0 :
    UndefValue::get(VectorType::get(Instr->getType(), VF));
  // Create a new entry in the WidenMap and initialize it to Undef or Null.
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  // For each vector unroll 'part':
  for (unsigned Part = 0; Part < UF; ++Part) {
    // For each scalar that we create:
    for (unsigned Width = 0; Width < VF; ++Width) {
      Instruction *Cloned = Instr->clone();
      if (!IsVoidRetTy)
        Cloned->setName(Instr->getName() + ".cloned");
      // Replace the operands of the cloned instructions with extracted
      // scalars.
      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
        Value *Op = Params[op][Part];
        // Param is a vector. Need to extract the right lane.
        if (Op->getType()->isVectorTy())
          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
        Cloned->setOperand(op, Op);
      }

      // Place the cloned scalar in the new loop.
      Builder.Insert(Cloned);

      // If the original scalar returns a value we need to place it in a vector
      // so that future users will be able to use it.
      if (!IsVoidRetTy)
        VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                       Builder.getInt32(Width));
    }
  }
}

Instruction *
InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
                                     Instruction *Loc) {
  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
    Legal->getRuntimePointerCheck();

  if (!PtrRtCheck->Need)
    return NULL;

  Instruction *MemoryRuntimeCheck = 0;
  unsigned NumPointers = PtrRtCheck->Pointers.size();
  SmallVector<Value* , 2> Starts;
  SmallVector<Value* , 2> Ends;

  SCEVExpander Exp(*SE, "induction");

  // Use this type for pointer arithmetic.
  Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);

  for (unsigned i = 0; i < NumPointers; ++i) {
    Value *Ptr = PtrRtCheck->Pointers[i];
    const SCEV *Sc = SE->getSCEV(Ptr);

    if (SE->isLoopInvariant(Sc, OrigLoop)) {
      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
            *Ptr << "\n");
      Starts.push_back(Ptr);
      Ends.push_back(Ptr);
    } else {
      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << "\n");

      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
      Starts.push_back(Start);
      Ends.push_back(End);
    }
  }

  IRBuilder<> ChkBuilder(Loc);

  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i+1; j < NumPointers; ++j) {
      // No need to check if two readonly pointers intersect.
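      // Two accessed ranges [Start0, End0] and [Start1, End1] can overlap
      // only if Start0 <= End1 and Start1 <= End0. The compares below build
      // exactly this test, and the result is OR'ed into the conflict flag
      // accumulated across all pointer pairs.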
      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
        continue;

      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
      Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
      Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc");

      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
      if (MemoryRuntimeCheck)
        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
                                         "conflict.rdx");

      MemoryRuntimeCheck = cast<Instruction>(IsConflict);
    }
  }

  return MemoryRuntimeCheck;
}

void
InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ]  <-- vector loop bypass (may consist of multiple blocks).
     /  |
    /   v
   |   [ ]      <-- vector pre header.
   |    |
   |    v
   |   [  ] \
   |   [  ]_|   <-- vector loop.
   |    |
    \   v
      >[ ]      <--- middle-block.
     /  |
    /   v
   |   [ ]      <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|    <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]      <-- exit block.
   ...
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(ExitBlock && "Must have an exit block");

  // Mark the old scalar loop with metadata that tells us not to vectorize this
  // loop again if we run into it.
  MDNode *MD = MDNode::get(OldBasicBlock->getContext(), None);
  OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  OldInduction = Legal->getInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Find the loop boundaries.
  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");

  // Get the total trip count from the count by adding 1.
  ExitCount = SE->getAddExpr(ExitCount,
                             SE->getConstant(ExitCount->getType(), 1));

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, "induction");

  // Count holds the overall loop count (N).
  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                   BypassBlock->getTerminator());

  // The loop index does not have to start at Zero. Find the original start
  // value from the induction PHI node. If we don't have an induction variable
  // then we know that it starts at zero.
  Builder.SetInsertPoint(BypassBlock->getTerminator());
  Value *StartIdx = ExtendedIdx = OldInduction ?
    Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
                       IdxTy) :
    ConstantInt::get(IdxTy, 0);

  assert(BypassBlock && "Invalid loop structure");
  LoopBypassBlocks.push_back(BypassBlock);

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VectorPH =
    BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
  BasicBlock *VecBody =
    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
  // inside the loop.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Generate the induction variable.
  Induction = Builder.CreatePHI(IdxTy, 2, "index");
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);

  // This is the IR builder that we use to add all of the logic for bypassing
  // the new vector loop.
  IRBuilder<> BypassBuilder(BypassBlock->getTerminator());

  // We may need to extend the index in case there is a type mismatch.
  // We know that the count starts at zero and does not overflow.
  if (Count->getType() != IdxTy) {
    // The exit count can be of pointer type. Convert it to the correct
    // integer type.
    if (ExitCount->getType()->isPointerTy())
      Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
    else
      Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
  }

  // Add the start index to the loop count to get the new end index.
  Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");

  // Now we need to generate the expression for N - (N % VF), which is
  // the part that the vectorized body will execute.
  Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
  Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
  Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
                                                   "end.idx.rnd.down");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop.
  Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
                                          "cmp.zero");

  BasicBlock *LastBypassBlock = BypassBlock;

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
                                                 BypassBlock->getTerminator());
  if (MemRuntimeCheck) {
    // Create a new block containing the memory check.
    BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
                                                          "vector.memcheck");
    LoopBypassBlocks.push_back(CheckBlock);

    // Replace the branch into the memory check block with a conditional branch
    // for the "few elements case".
1439 Instruction *OldTerm = BypassBlock->getTerminator(); 1440 BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); 1441 OldTerm->eraseFromParent(); 1442 1443 Cmp = MemRuntimeCheck; 1444 LastBypassBlock = CheckBlock; 1445 } 1446 1447 LastBypassBlock->getTerminator()->eraseFromParent(); 1448 BranchInst::Create(MiddleBlock, VectorPH, Cmp, 1449 LastBypassBlock); 1450 1451 // We are going to resume the execution of the scalar loop. 1452 // Go over all of the induction variables that we found and fix the 1453 // PHIs that are left in the scalar version of the loop. 1454 // The starting values of PHI nodes depend on the counter of the last 1455 // iteration in the vectorized loop. 1456 // If we come from a bypass edge then we need to start from the original 1457 // start value. 1458 1459 // This variable saves the new starting index for the scalar loop. 1460 PHINode *ResumeIndex = 0; 1461 LoopVectorizationLegality::InductionList::iterator I, E; 1462 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 1463 // Set builder to point to last bypass block. 1464 BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); 1465 for (I = List->begin(), E = List->end(); I != E; ++I) { 1466 PHINode *OrigPhi = I->first; 1467 LoopVectorizationLegality::InductionInfo II = I->second; 1468 1469 Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); 1470 PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", 1471 MiddleBlock->getTerminator()); 1472 // We might have extended the type of the induction variable but we need a 1473 // truncated version for the scalar loop. 1474 PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? 1475 PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", 1476 MiddleBlock->getTerminator()) : 0; 1477 1478 Value *EndValue = 0; 1479 switch (II.IK) { 1480 case LoopVectorizationLegality::IK_NoInduction: 1481 llvm_unreachable("Unknown induction"); 1482 case LoopVectorizationLegality::IK_IntInduction: { 1483 // Handle the integer induction counter. 1484 assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); 1485 1486 // We have the canonical induction variable. 1487 if (OrigPhi == OldInduction) { 1488 // Create a truncated version of the resume value for the scalar loop, 1489 // we might have promoted the type to a larger width. 1490 EndValue = 1491 BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); 1492 // The new PHI merges the original incoming value, in case of a bypass, 1493 // or the value at the end of the vectorized loop. 1494 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 1495 TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); 1496 TruncResumeVal->addIncoming(EndValue, VecBody); 1497 1498 // We know what the end value is. 1499 EndValue = IdxEndRoundDown; 1500 // We also know which PHI node holds it. 1501 ResumeIndex = ResumeVal; 1502 break; 1503 } 1504 1505 // Not the canonical induction variable - add the vector loop count to the 1506 // start value. 1507 Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, 1508 II.StartValue->getType(), 1509 "cast.crd"); 1510 EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); 1511 break; 1512 } 1513 case LoopVectorizationLegality::IK_ReverseIntInduction: { 1514 // Convert the CountRoundDown variable to the PHI size. 1515 Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, 1516 II.StartValue->getType(), 1517 "cast.crd"); 1518 // Handle reverse integer induction counter. 
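 // For example (illustrative): a counter that starts at %n and counts down
 // resumes the scalar loop at %n - CountRoundDown, since the vector body
 // executed CountRoundDown of the original iterations.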
1519 EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); 1520 break; 1521 } 1522 case LoopVectorizationLegality::IK_PtrInduction: { 1523 // For pointer induction variables, calculate the offset using 1524 // the end index. 1525 EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, 1526 "ptr.ind.end"); 1527 break; 1528 } 1529 case LoopVectorizationLegality::IK_ReversePtrInduction: { 1530 // The value at the end of the loop for the reverse pointer is calculated 1531 // by creating a GEP with a negative index starting from the start value. 1532 Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); 1533 Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, 1534 "rev.ind.end"); 1535 EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, 1536 "rev.ptr.ind.end"); 1537 break; 1538 } 1539 }// end of case 1540 1541 // The new PHI merges the original incoming value, in case of a bypass, 1542 // or the value at the end of the vectorized loop. 1543 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) { 1544 if (OrigPhi == OldInduction) 1545 ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); 1546 else 1547 ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); 1548 } 1549 ResumeVal->addIncoming(EndValue, VecBody); 1550 1551 // Fix the scalar body counter (PHI node). 1552 unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); 1553 // The old inductions phi node in the scalar body needs the truncated value. 1554 if (OrigPhi == OldInduction) 1555 OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal); 1556 else 1557 OrigPhi->setIncomingValue(BlockIdx, ResumeVal); 1558 } 1559 1560 // If we are generating a new induction variable then we also need to 1561 // generate the code that calculates the exit value. This value is not 1562 // simply the end of the counter because we may skip the vectorized body 1563 // in case of a runtime check. 1564 if (!OldInduction){ 1565 assert(!ResumeIndex && "Unexpected resume value found"); 1566 ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", 1567 MiddleBlock->getTerminator()); 1568 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 1569 ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); 1570 ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); 1571 } 1572 1573 // Make sure that we found the index where scalar loop needs to continue. 1574 assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && 1575 "Invalid resume Index"); 1576 1577 // Add a check in the middle block to see if we have completed 1578 // all of the iterations in the first vector loop. 1579 // If (N - N%VF) == N, then we *don't* need to run the remainder. 1580 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 1581 ResumeIndex, "cmp.n", 1582 MiddleBlock->getTerminator()); 1583 1584 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 1585 // Remove the old terminator. 1586 MiddleBlock->getTerminator()->eraseFromParent(); 1587 1588 // Create i+1 and fill the PHINode. 1589 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 1590 Induction->addIncoming(StartIdx, VectorPH); 1591 Induction->addIncoming(NextIdx, VecBody); 1592 // Create the compare. 1593 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 1594 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 1595 1596 // Now we have two terminators. Remove the old one from the block. 
1597 VecBody->getTerminator()->eraseFromParent(); 1598 1599 // Get ready to start creating new instructions into the vectorized body. 1600 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 1601 1602 // Create and register the new vector loop. 1603 Loop* Lp = new Loop(); 1604 Loop *ParentLoop = OrigLoop->getParentLoop(); 1605 1606 // Insert the new loop into the loop nest and register the new basic blocks. 1607 if (ParentLoop) { 1608 ParentLoop->addChildLoop(Lp); 1609 for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) 1610 ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase()); 1611 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 1612 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 1613 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 1614 } else { 1615 LI->addTopLevelLoop(Lp); 1616 } 1617 1618 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 1619 1620 // Save the state. 1621 LoopVectorPreHeader = VectorPH; 1622 LoopScalarPreHeader = ScalarPH; 1623 LoopMiddleBlock = MiddleBlock; 1624 LoopExitBlock = ExitBlock; 1625 LoopVectorBody = VecBody; 1626 LoopScalarBody = OldBasicBlock; 1627} 1628 1629/// This function returns the identity element (or neutral element) for 1630/// the operation K. 1631Constant* 1632LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { 1633 switch (K) { 1634 case RK_IntegerXor: 1635 case RK_IntegerAdd: 1636 case RK_IntegerOr: 1637 // Adding, Xoring, Oring zero to a number does not change it. 1638 return ConstantInt::get(Tp, 0); 1639 case RK_IntegerMult: 1640 // Multiplying a number by 1 does not change it. 1641 return ConstantInt::get(Tp, 1); 1642 case RK_IntegerAnd: 1643 // AND-ing a number with an all-1 value does not change it. 1644 return ConstantInt::get(Tp, -1, true); 1645 case RK_FloatMult: 1646 // Multiplying a number by 1 does not change it. 1647 return ConstantFP::get(Tp, 1.0L); 1648 case RK_FloatAdd: 1649 // Adding zero to a number does not change it. 1650 return ConstantFP::get(Tp, 0.0L); 1651 default: 1652 llvm_unreachable("Unknown reduction kind"); 1653 } 1654} 1655 1656static Intrinsic::ID 1657getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { 1658 // If we have an intrinsic call, check if it is trivially vectorizable. 1659 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { 1660 switch (II->getIntrinsicID()) { 1661 case Intrinsic::sqrt: 1662 case Intrinsic::sin: 1663 case Intrinsic::cos: 1664 case Intrinsic::exp: 1665 case Intrinsic::exp2: 1666 case Intrinsic::log: 1667 case Intrinsic::log10: 1668 case Intrinsic::log2: 1669 case Intrinsic::fabs: 1670 case Intrinsic::floor: 1671 case Intrinsic::ceil: 1672 case Intrinsic::trunc: 1673 case Intrinsic::rint: 1674 case Intrinsic::nearbyint: 1675 case Intrinsic::pow: 1676 case Intrinsic::fma: 1677 case Intrinsic::fmuladd: 1678 return II->getIntrinsicID(); 1679 default: 1680 return Intrinsic::not_intrinsic; 1681 } 1682 } 1683 1684 if (!TLI) 1685 return Intrinsic::not_intrinsic; 1686 1687 LibFunc::Func Func; 1688 Function *F = CI->getCalledFunction(); 1689 // We're going to make assumptions on the semantics of the functions, check 1690 // that the target knows that it's available in this environment. 1691 if (!F || !TLI->getLibFunc(F->getName(), Func)) 1692 return Intrinsic::not_intrinsic; 1693 1694 // Otherwise check if we have a call to a function that can be turned into a 1695 // vector intrinsic. 
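// For example, a call to 'sinf' whose availability the target confirmed maps
// to Intrinsic::sin; the widened loop body can then call the vector form,
// e.g. llvm.sin.v4f32 for VF = 4 (illustrative VF).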
1696 switch (Func) { 1697 default: 1698 break; 1699 case LibFunc::sin: 1700 case LibFunc::sinf: 1701 case LibFunc::sinl: 1702 return Intrinsic::sin; 1703 case LibFunc::cos: 1704 case LibFunc::cosf: 1705 case LibFunc::cosl: 1706 return Intrinsic::cos; 1707 case LibFunc::exp: 1708 case LibFunc::expf: 1709 case LibFunc::expl: 1710 return Intrinsic::exp; 1711 case LibFunc::exp2: 1712 case LibFunc::exp2f: 1713 case LibFunc::exp2l: 1714 return Intrinsic::exp2; 1715 case LibFunc::log: 1716 case LibFunc::logf: 1717 case LibFunc::logl: 1718 return Intrinsic::log; 1719 case LibFunc::log10: 1720 case LibFunc::log10f: 1721 case LibFunc::log10l: 1722 return Intrinsic::log10; 1723 case LibFunc::log2: 1724 case LibFunc::log2f: 1725 case LibFunc::log2l: 1726 return Intrinsic::log2; 1727 case LibFunc::fabs: 1728 case LibFunc::fabsf: 1729 case LibFunc::fabsl: 1730 return Intrinsic::fabs; 1731 case LibFunc::floor: 1732 case LibFunc::floorf: 1733 case LibFunc::floorl: 1734 return Intrinsic::floor; 1735 case LibFunc::ceil: 1736 case LibFunc::ceilf: 1737 case LibFunc::ceill: 1738 return Intrinsic::ceil; 1739 case LibFunc::trunc: 1740 case LibFunc::truncf: 1741 case LibFunc::truncl: 1742 return Intrinsic::trunc; 1743 case LibFunc::rint: 1744 case LibFunc::rintf: 1745 case LibFunc::rintl: 1746 return Intrinsic::rint; 1747 case LibFunc::nearbyint: 1748 case LibFunc::nearbyintf: 1749 case LibFunc::nearbyintl: 1750 return Intrinsic::nearbyint; 1751 case LibFunc::pow: 1752 case LibFunc::powf: 1753 case LibFunc::powl: 1754 return Intrinsic::pow; 1755 } 1756 1757 return Intrinsic::not_intrinsic; 1758} 1759 1760/// This function translates the reduction kind to an LLVM binary operator. 1761static unsigned 1762getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { 1763 switch (Kind) { 1764 case LoopVectorizationLegality::RK_IntegerAdd: 1765 return Instruction::Add; 1766 case LoopVectorizationLegality::RK_IntegerMult: 1767 return Instruction::Mul; 1768 case LoopVectorizationLegality::RK_IntegerOr: 1769 return Instruction::Or; 1770 case LoopVectorizationLegality::RK_IntegerAnd: 1771 return Instruction::And; 1772 case LoopVectorizationLegality::RK_IntegerXor: 1773 return Instruction::Xor; 1774 case LoopVectorizationLegality::RK_FloatMult: 1775 return Instruction::FMul; 1776 case LoopVectorizationLegality::RK_FloatAdd: 1777 return Instruction::FAdd; 1778 case LoopVectorizationLegality::RK_IntegerMinMax: 1779 return Instruction::ICmp; 1780 case LoopVectorizationLegality::RK_FloatMinMax: 1781 return Instruction::FCmp; 1782 default: 1783 llvm_unreachable("Unknown reduction operation"); 1784 } 1785} 1786 1787Value *createMinMaxOp(IRBuilder<> &Builder, 1788 LoopVectorizationLegality::MinMaxReductionKind RK, 1789 Value *Left, 1790 Value *Right) { 1791 CmpInst::Predicate P = CmpInst::ICMP_NE; 1792 switch (RK) { 1793 default: 1794 llvm_unreachable("Unknown min/max reduction kind"); 1795 case LoopVectorizationLegality::MRK_UIntMin: 1796 P = CmpInst::ICMP_ULT; 1797 break; 1798 case LoopVectorizationLegality::MRK_UIntMax: 1799 P = CmpInst::ICMP_UGT; 1800 break; 1801 case LoopVectorizationLegality::MRK_SIntMin: 1802 P = CmpInst::ICMP_SLT; 1803 break; 1804 case LoopVectorizationLegality::MRK_SIntMax: 1805 P = CmpInst::ICMP_SGT; 1806 break; 1807 case LoopVectorizationLegality::MRK_FloatMin: 1808 P = CmpInst::FCMP_OLT; 1809 break; 1810 case LoopVectorizationLegality::MRK_FloatMax: 1811 P = CmpInst::FCMP_OGT; 1812 break; 1813 } 1814 1815 Value *Cmp; 1816 if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == 
LoopVectorizationLegality::MRK_FloatMax) 1817 Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); 1818 else 1819 Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); 1820 1821 Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); 1822 return Select; 1823} 1824 1825void 1826InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { 1827 //===------------------------------------------------===// 1828 // 1829 // Notice: any optimization or new instruction that goes 1830 // into the code below should also be implemented in 1831 // the cost-model. 1832 // 1833 //===------------------------------------------------===// 1834 Constant *Zero = Builder.getInt32(0); 1835 1836 // In order to support reduction variables we need to be able to vectorize 1837 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 1838 // stages. First, we create a new vector PHI node with no incoming edges. 1839 // We use this value when we vectorize all of the instructions that use the 1840 // PHI. Next, after all of the instructions in the block are complete we 1841 // add the new incoming edges to the PHI. At this point all of the 1842 // instructions in the basic block are vectorized, so we can use them to 1843 // construct the PHI. 1844 PhiVector RdxPHIsToFix; 1845 1846 // Scan the loop in a topological order to ensure that defs are vectorized 1847 // before users. 1848 LoopBlocksDFS DFS(OrigLoop); 1849 DFS.perform(LI); 1850 1851 // Vectorize all of the blocks in the original loop. 1852 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 1853 be = DFS.endRPO(); bb != be; ++bb) 1854 vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix); 1855 1856 // At this point every instruction in the original loop is widened to 1857 // a vector form. We are almost done. Now, we need to fix the PHI nodes 1858 // that we vectorized. The PHI nodes are currently empty because we did 1859 // not want to introduce cycles. Notice that the remaining PHI nodes 1860 // that we need to fix are reduction variables. 1861 1862 // Create the 'reduced' values for each of the reduction vars. 1863 // The reduced values are the vector values that we scalarize and combine 1864 // after the loop is finished. 1865 for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); 1866 it != e; ++it) { 1867 PHINode *RdxPhi = *it; 1868 assert(RdxPhi && "Unable to recover vectorized PHI"); 1869 1870 // Find the reduction variable descriptor. 1871 assert(Legal->getReductionVars()->count(RdxPhi) && 1872 "Unable to find the reduction variable"); 1873 LoopVectorizationLegality::ReductionDescriptor RdxDesc = 1874 (*Legal->getReductionVars())[RdxPhi]; 1875 1876 // We need to generate a reduction vector from the incoming scalar. 1877 // To do so, we need to generate the 'identity' vector and override 1878 // one of the elements with the incoming scalar reduction. We need 1879 // to do it in the vector-loop preheader. 1880 Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator()); 1881 1882 // This is the vector-clone of the value that leaves the loop. 1883 VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); 1884 Type *VecTy = VectorExit[0]->getType(); 1885 1886 // Find the reduction identity variable. Zero for addition, or, xor, 1887 // one for multiplication, -1 for And.
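    // For example (VF = 4, illustrative): an integer add reduction with the
    // scalar start value %s uses the identity vector <0, 0, 0, 0> and the
    // start vector <%s, 0, 0, 0>, so the incoming scalar is folded in exactly
    // once by the final horizontal reduction.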
1888 Value *Identity; 1889 Value *VectorStart; 1890 if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || 1891 RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { 1892 // Min/max reductions have the start value as their identity. 1893 VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue, 1894 "minmax.ident"); 1895 } else { 1896 Constant *Iden = 1897 LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, 1898 VecTy->getScalarType()); 1899 Identity = ConstantVector::getSplat(VF, Iden); 1900 1901 // This vector is the Identity vector where the first element is the 1902 // incoming scalar reduction. 1903 VectorStart = Builder.CreateInsertElement(Identity, 1904 RdxDesc.StartValue, Zero); 1905 } 1906 1907 // Fix the vector-loop phi. 1908 // We created the induction variable so we know that the 1909 // preheader is the first entry. 1910 BasicBlock *VecPreheader = Induction->getIncomingBlock(0); 1911 1912 // Reductions do not have to start at zero. They can start with 1913 // any loop invariant values. 1914 VectorParts &VecRdxPhi = WidenMap.get(RdxPhi); 1915 BasicBlock *Latch = OrigLoop->getLoopLatch(); 1916 Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); 1917 VectorParts &Val = getVectorValue(LoopVal); 1918 for (unsigned part = 0; part < UF; ++part) { 1919 // Make sure to add the reduction start value only to the 1920 // first unroll part. 1921 Value *StartVal = (part == 0) ? VectorStart : Identity; 1922 cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); 1923 cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody); 1924 } 1925 1926 // Before each round, move the insertion point right between 1927 // the PHIs and the values we are going to write. 1928 // This allows us to write both PHINodes and the extractelement 1929 // instructions. 1930 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); 1931 1932 VectorParts RdxParts; 1933 for (unsigned part = 0; part < UF; ++part) { 1934 // This PHINode contains the vectorized reduction variable, or 1935 // the initial value vector, if we bypass the vector loop. 1936 VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); 1937 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); 1938 Value *StartVal = (part == 0) ? VectorStart : Identity; 1939 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 1940 NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); 1941 NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody); 1942 RdxParts.push_back(NewPhi); 1943 } 1944 1945 // Reduce all of the unrolled parts into a single vector. 1946 Value *ReducedPartRdx = RdxParts[0]; 1947 unsigned Op = getReductionBinOp(RdxDesc.Kind); 1948 for (unsigned part = 1; part < UF; ++part) { 1949 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 1950 ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op, 1951 RdxParts[part], ReducedPartRdx, 1952 "bin.rdx"); 1953 else 1954 ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, 1955 ReducedPartRdx, RdxParts[part]); 1956 } 1957 1958 // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles 1959 // and vector ops, reducing the set of values being computed by half each 1960 // round. 1961 assert(isPowerOf2_32(VF) && 1962 "Reduction emission only supported for pow2 vectors!"); 1963 Value *TmpVec = ReducedPartRdx; 1964 SmallVector<Constant*, 32> ShuffleMask(VF, 0); 1965 for (unsigned i = VF; i != 1; i >>= 1) { 1966 // Move the upper half of the vector to the lower half.
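      // For VF = 4 (illustrative) the rounds use the shuffle masks
      // <2, 3, undef, undef> and then <1, undef, undef, undef>, combining
      // lanes pairwise until the result is in lane 0.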
1967 for (unsigned j = 0; j != i/2; ++j) 1968 ShuffleMask[j] = Builder.getInt32(i/2 + j); 1969 1970 // Fill the rest of the mask with undef. 1971 std::fill(&ShuffleMask[i/2], ShuffleMask.end(), 1972 UndefValue::get(Builder.getInt32Ty())); 1973 1974 Value *Shuf = 1975 Builder.CreateShuffleVector(TmpVec, 1976 UndefValue::get(TmpVec->getType()), 1977 ConstantVector::get(ShuffleMask), 1978 "rdx.shuf"); 1979 1980 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 1981 TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, 1982 "bin.rdx"); 1983 else 1984 TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); 1985 } 1986 1987 // The result is in the first element of the vector. 1988 Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); 1989 1990 // Now, we need to fix the users of the reduction variable 1991 // inside and outside of the scalar remainder loop. 1992 // We know that the loop is in LCSSA form. We need to update the 1993 // PHI nodes in the exit blocks. 1994 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 1995 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 1996 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 1997 if (!LCSSAPhi) continue; 1998 1999 // All PHINodes need to have a single entry edge, or two if 2000 // we already fixed them. 2001 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 2002 2003 // We found our reduction value exit-PHI. Update it with the 2004 // incoming bypass edge. 2005 if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { 2006 // Add an edge coming from the bypass. 2007 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); 2008 break; 2009 } 2010 }// end of the LCSSA phi scan. 2011 2012 // Fix the scalar loop reduction variable with the incoming reduction sum 2013 // from the vector body and from the backedge value. 2014 int IncomingEdgeBlockIdx = 2015 (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); 2016 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 2017 // Pick the other block. 2018 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 2019 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); 2020 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); 2021 }// end of for each redux variable. 2022 2023 // The Loop exit block may have single value PHI nodes where the incoming 2024 // value is 'undef'. While vectorizing we only handled real values that 2025 // were defined inside the loop. Here we handle the 'undef case'. 2026 // See PR14725. 2027 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 2028 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 2029 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 2030 if (!LCSSAPhi) continue; 2031 if (LCSSAPhi->getNumIncomingValues() == 1) 2032 LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), 2033 LoopMiddleBlock); 2034 } 2035} 2036 2037InnerLoopVectorizer::VectorParts 2038InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 2039 assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && 2040 "Invalid edge"); 2041 2042 VectorParts SrcMask = createBlockInMask(Src); 2043 2044 // The terminator has to be a branch inst! 
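  // For a conditional branch the edge mask is SrcMask & Cond when Dst is the
  // taken ('true') successor and SrcMask & !Cond otherwise, computed below
  // for each unroll part.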
2045 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 2046 assert(BI && "Unexpected terminator found"); 2047 2048 if (BI->isConditional()) { 2049 VectorParts EdgeMask = getVectorValue(BI->getCondition()); 2050 2051 if (BI->getSuccessor(0) != Dst) 2052 for (unsigned part = 0; part < UF; ++part) 2053 EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); 2054 2055 for (unsigned part = 0; part < UF; ++part) 2056 EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); 2057 return EdgeMask; 2058 } 2059 2060 return SrcMask; 2061} 2062 2063InnerLoopVectorizer::VectorParts 2064InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { 2065 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 2066 2067 // Loop incoming mask is all-one. 2068 if (OrigLoop->getHeader() == BB) { 2069 Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); 2070 return getVectorValue(C); 2071 } 2072 2073 // This is the block mask. We OR all incoming edges, and with zero. 2074 Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); 2075 VectorParts BlockMask = getVectorValue(Zero); 2076 2077 // For each pred: 2078 for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { 2079 VectorParts EM = createEdgeMask(*it, BB); 2080 for (unsigned part = 0; part < UF; ++part) 2081 BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); 2082 } 2083 2084 return BlockMask; 2085} 2086 2087void 2088InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, 2089 BasicBlock *BB, PhiVector *PV) { 2090 // For each instruction in the old loop. 2091 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 2092 VectorParts &Entry = WidenMap.get(it); 2093 switch (it->getOpcode()) { 2094 case Instruction::Br: 2095 // Nothing to do for PHIs and BR, since we already took care of the 2096 // loop control flow instructions. 2097 continue; 2098 case Instruction::PHI:{ 2099 PHINode* P = cast<PHINode>(it); 2100 // Handle reduction variables: 2101 if (Legal->getReductionVars()->count(P)) { 2102 for (unsigned part = 0; part < UF; ++part) { 2103 // This is phase one of vectorizing PHIs. 2104 Type *VecTy = VectorType::get(it->getType(), VF); 2105 Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", 2106 LoopVectorBody-> getFirstInsertionPt()); 2107 } 2108 PV->push_back(P); 2109 continue; 2110 } 2111 2112 // Check for PHI nodes that are lowered to vector selects. 2113 if (P->getParent() != OrigLoop->getHeader()) { 2114 // We know that all PHIs in non header blocks are converted into 2115 // selects, so we don't have to worry about the insertion order and we 2116 // can just use the builder. 2117 // At this point we generate the predication tree. There may be 2118 // duplications since this is a simple recursive scan, but future 2119 // optimizations will clean it up. 2120 2121 unsigned NumIncoming = P->getNumIncomingValues(); 2122 assert(NumIncoming > 1 && "Invalid PHI"); 2123 2124 // Generate a sequence of selects of the form: 2125 // SELECT(Mask3, In3, 2126 // SELECT(Mask2, In2, 2127 // ( ...))) 2128 for (unsigned In = 0; In < NumIncoming; In++) { 2129 VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), 2130 P->getParent()); 2131 VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); 2132 2133 for (unsigned part = 0; part < UF; ++part) { 2134 // We don't need to 'select' the first PHI operand because it is 2135 // the default value if all of the other masks don't match. 
2136 if (In == 0) 2137 Entry[part] = In0[part]; 2138 else 2139 // Select between the current value and the previous incoming edge 2140 // based on the incoming mask. 2141 Entry[part] = Builder.CreateSelect(Cond[part], In0[part], 2142 Entry[part], "predphi"); 2143 } 2144 } 2145 continue; 2146 } 2147 2148 // This PHINode must be an induction variable. 2149 // Make sure that we know about it. 2150 assert(Legal->getInductionVars()->count(P) && 2151 "Not an induction variable"); 2152 2153 LoopVectorizationLegality::InductionInfo II = 2154 Legal->getInductionVars()->lookup(P); 2155 2156 switch (II.IK) { 2157 case LoopVectorizationLegality::IK_NoInduction: 2158 llvm_unreachable("Unknown induction"); 2159 case LoopVectorizationLegality::IK_IntInduction: { 2160 assert(P->getType() == II.StartValue->getType() && "Types must match"); 2161 Type *PhiTy = P->getType(); 2162 Value *Broadcasted; 2163 if (P == OldInduction) { 2164 // Handle the canonical induction variable. We might have had to 2165 // extend the type. 2166 Broadcasted = Builder.CreateTrunc(Induction, PhiTy); 2167 } else { 2168 // Handle other induction variables that are now based on the 2169 // canonical one. 2170 Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, 2171 "normalized.idx"); 2172 NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); 2173 Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, 2174 "offset.idx"); 2175 } 2176 Broadcasted = getBroadcastInstrs(Broadcasted); 2177 // After broadcasting the induction variable we need to make the vector 2178 // consecutive by adding 0, 1, 2, etc. 2179 for (unsigned part = 0; part < UF; ++part) 2180 Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); 2181 continue; 2182 } 2183 case LoopVectorizationLegality::IK_ReverseIntInduction: 2184 case LoopVectorizationLegality::IK_PtrInduction: 2185 case LoopVectorizationLegality::IK_ReversePtrInduction: 2186 // Handle reverse integer and pointer inductions. 2187 Value *StartIdx = ExtendedIdx; 2188 // This is the normalized GEP that starts counting at zero. 2189 Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, 2190 "normalized.idx"); 2191 2192 // Handle the reverse integer induction variable case. 2193 if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { 2194 IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); 2195 Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, 2196 "resize.norm.idx"); 2197 Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, 2198 "reverse.idx"); 2199 2200 // This is a new value so do not hoist it out. 2201 Value *Broadcasted = getBroadcastInstrs(ReverseInd); 2202 // After broadcasting the induction variable we need to make the 2203 // vector consecutive by adding ... -3, -2, -1, 0. 2204 for (unsigned part = 0; part < UF; ++part) 2205 Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, 2206 true); 2207 continue; 2208 } 2209 2210 // Handle the pointer induction variable case. 2211 assert(P->getType()->isPointerTy() && "Unexpected type."); 2212 2213 // Is this a reverse induction ptr or a consecutive induction ptr. 2214 bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == 2215 II.IK); 2216 2217 // This is the vector of results. Notice that we don't generate 2218 // vector geps because scalar geps result in better code. 
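        // For example (VF = 4, forward case, illustrative): lane i of unroll
        // part 'part' receives the scalar GEP at offset
        // normalized.idx + (i + part * VF) from the start pointer, and the
        // four lanes are packed together with insertelement.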
2219 for (unsigned part = 0; part < UF; ++part) { 2220 Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); 2221 for (unsigned int i = 0; i < VF; ++i) { 2222 int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); 2223 Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); 2224 Value *GlobalIdx; 2225 if (!Reverse) 2226 GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); 2227 else 2228 GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); 2229 2230 Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, 2231 "next.gep"); 2232 VecVal = Builder.CreateInsertElement(VecVal, SclrGep, 2233 Builder.getInt32(i), 2234 "insert.gep"); 2235 } 2236 Entry[part] = VecVal; 2237 } 2238 continue; 2239 } 2240 2241 }// End of PHI. 2242 2243 case Instruction::Add: 2244 case Instruction::FAdd: 2245 case Instruction::Sub: 2246 case Instruction::FSub: 2247 case Instruction::Mul: 2248 case Instruction::FMul: 2249 case Instruction::UDiv: 2250 case Instruction::SDiv: 2251 case Instruction::FDiv: 2252 case Instruction::URem: 2253 case Instruction::SRem: 2254 case Instruction::FRem: 2255 case Instruction::Shl: 2256 case Instruction::LShr: 2257 case Instruction::AShr: 2258 case Instruction::And: 2259 case Instruction::Or: 2260 case Instruction::Xor: { 2261 // Just widen binops. 2262 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); 2263 VectorParts &A = getVectorValue(it->getOperand(0)); 2264 VectorParts &B = getVectorValue(it->getOperand(1)); 2265 2266 // Use this vector value for all users of the original instruction. 2267 for (unsigned Part = 0; Part < UF; ++Part) { 2268 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); 2269 2270 // Update the NSW, NUW and Exact flags. Notice: V can be an Undef. 2271 BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V); 2272 if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) { 2273 VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); 2274 VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); 2275 } 2276 if (VecOp && isa<PossiblyExactOperator>(VecOp)) 2277 VecOp->setIsExact(BinOp->isExact()); 2278 2279 Entry[Part] = V; 2280 } 2281 break; 2282 } 2283 case Instruction::Select: { 2284 // Widen selects. 2285 // If the selector is loop invariant we can create a select 2286 // instruction with a scalar condition. Otherwise, use vector-select. 2287 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), 2288 OrigLoop); 2289 2290 // The condition can be loop invariant but still defined inside the 2291 // loop. This means that we can't just use the original 'cond' value. 2292 // We have to take the 'vectorized' value and pick the first lane. 2293 // Instcombine will make this a no-op. 2294 VectorParts &Cond = getVectorValue(it->getOperand(0)); 2295 VectorParts &Op0 = getVectorValue(it->getOperand(1)); 2296 VectorParts &Op1 = getVectorValue(it->getOperand(2)); 2297 Value *ScalarCond = Builder.CreateExtractElement(Cond[0], 2298 Builder.getInt32(0)); 2299 for (unsigned Part = 0; Part < UF; ++Part) { 2300 Entry[Part] = Builder.CreateSelect( 2301 InvariantCond ? ScalarCond : Cond[Part], 2302 Op0[Part], 2303 Op1[Part]); 2304 } 2305 break; 2306 } 2307 2308 case Instruction::ICmp: 2309 case Instruction::FCmp: { 2310 // Widen compares. Generate vector compares. 
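      // e.g. a scalar 'icmp slt i32 %a, %b' becomes
      // 'icmp slt <4 x i32> %A, %B' for VF = 4, producing one <4 x i1> mask
      // per unroll part (illustrative VF).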
2311 bool FCmp = (it->getOpcode() == Instruction::FCmp); 2312 CmpInst *Cmp = dyn_cast<CmpInst>(it); 2313 VectorParts &A = getVectorValue(it->getOperand(0)); 2314 VectorParts &B = getVectorValue(it->getOperand(1)); 2315 for (unsigned Part = 0; Part < UF; ++Part) { 2316 Value *C = 0; 2317 if (FCmp) 2318 C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); 2319 else 2320 C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); 2321 Entry[Part] = C; 2322 } 2323 break; 2324 } 2325 2326 case Instruction::Store: 2327 case Instruction::Load: 2328 vectorizeMemoryInstruction(it, Legal); 2329 break; 2330 case Instruction::ZExt: 2331 case Instruction::SExt: 2332 case Instruction::FPToUI: 2333 case Instruction::FPToSI: 2334 case Instruction::FPExt: 2335 case Instruction::PtrToInt: 2336 case Instruction::IntToPtr: 2337 case Instruction::SIToFP: 2338 case Instruction::UIToFP: 2339 case Instruction::Trunc: 2340 case Instruction::FPTrunc: 2341 case Instruction::BitCast: { 2342 CastInst *CI = dyn_cast<CastInst>(it); 2343 /// Optimize the special case where the source is the induction 2344 /// variable. Notice that we can only optimize the 'trunc' case 2345 /// because: a. FP conversions lose precision, b. sext/zext may wrap, 2346 /// c. other casts depend on pointer size. 2347 if (CI->getOperand(0) == OldInduction && 2348 it->getOpcode() == Instruction::Trunc) { 2349 Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, 2350 CI->getType()); 2351 Value *Broadcasted = getBroadcastInstrs(ScalarCast); 2352 for (unsigned Part = 0; Part < UF; ++Part) 2353 Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); 2354 break; 2355 } 2356 /// Vectorize casts. 2357 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 2358 2359 VectorParts &A = getVectorValue(it->getOperand(0)); 2360 for (unsigned Part = 0; Part < UF; ++Part) 2361 Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); 2362 break; 2363 } 2364 2365 case Instruction::Call: { 2366 // Ignore dbg intrinsics. 2367 if (isa<DbgInfoIntrinsic>(it)) 2368 break; 2369 2370 Module *M = BB->getParent()->getParent(); 2371 CallInst *CI = cast<CallInst>(it); 2372 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 2373 assert(ID && "Not an intrinsic call!"); 2374 for (unsigned Part = 0; Part < UF; ++Part) { 2375 SmallVector<Value*, 4> Args; 2376 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 2377 VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); 2378 Args.push_back(Arg[Part]); 2379 } 2380 Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) }; 2381 Function *F = Intrinsic::getDeclaration(M, ID, Tys); 2382 Entry[Part] = Builder.CreateCall(F, Args); 2383 } 2384 break; 2385 } 2386 2387 default: 2388 // All other instructions are unsupported. Scalarize them. 2389 scalarizeInstruction(it); 2390 break; 2391 }// end of switch. 2392 }// end of for_each instr. 2393} 2394 2395void InnerLoopVectorizer::updateAnalysis() { 2396 // Forget the original basic block. 2397 SE->forgetLoop(OrigLoop); 2398 2399 // Update the dominator tree information. 
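  // In the CFG built by createEmptyLoop the bypass blocks form a chain: the
  // last one dominates the vector preheader and body, while the middle block
  // is immediately dominated by the first bypass block because the vector
  // loop may be skipped entirely.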
2400 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && 2401 "Entry does not dominate exit."); 2402 2403 for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) 2404 DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); 2405 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); 2406 DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); 2407 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front()); 2408 DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); 2409 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 2410 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2411 2412 DEBUG(DT->verifyAnalysis()); 2413} 2414 2415bool LoopVectorizationLegality::canVectorizeWithIfConvert() { 2416 if (!EnableIfConversion) 2417 return false; 2418 2419 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); 2420 std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); 2421 2422 // Collect the blocks that need predication. 2423 for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { 2424 BasicBlock *BB = LoopBlocks[i]; 2425 2426 // We don't support switch statements inside loops. 2427 if (!isa<BranchInst>(BB->getTerminator())) 2428 return false; 2429 2430 // We must be able to predicate all blocks that need to be predicated. 2431 if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) 2432 return false; 2433 } 2434 2435 // Check that we can actually speculate the hoistable loads. 2436 if (!LoadSpeculation.canHoistAllLoads()) 2437 return false; 2438 2439 // We can if-convert this loop. 2440 return true; 2441} 2442 2443bool LoopVectorizationLegality::canVectorize() { 2444 assert(TheLoop->getLoopPreheader() && "No preheader!!"); 2445 2446 // We can only vectorize innermost loops. 2447 if (TheLoop->getSubLoopsVector().size()) 2448 return false; 2449 2450 // We must have a single backedge. 2451 if (TheLoop->getNumBackEdges() != 1) 2452 return false; 2453 2454 // We must have a single exiting block. 2455 if (!TheLoop->getExitingBlock()) 2456 return false; 2457 2458 unsigned NumBlocks = TheLoop->getNumBlocks(); 2459 2460 // Check if we can if-convert non single-bb loops. 2461 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { 2462 DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); 2463 return false; 2464 } 2465 2466 // We need to have a loop header. 2467 BasicBlock *Latch = TheLoop->getLoopLatch(); 2468 DEBUG(dbgs() << "LV: Found a loop: " << 2469 TheLoop->getHeader()->getName() << "\n"); 2470 2471 // ScalarEvolution needs to be able to find the exit count. 2472 const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); 2473 if (ExitCount == SE->getCouldNotCompute()) { 2474 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 2475 return false; 2476 } 2477 2478 // Do not loop-vectorize loops with a tiny trip count. 2479 unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); 2480 if (TC > 0u && TC < TinyTripCountVectorThreshold) { 2481 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << 2482 "This loop is not worth vectorizing.\n"); 2483 return false; 2484 } 2485 2486 // Check if we can vectorize the instructions and CFG in this loop. 2487 if (!canVectorizeInstrs()) { 2488 DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); 2489 return false; 2490 } 2491 2492 // Go over each instruction and look at memory deps. 
2493 if (!canVectorizeMemory()) { 2494 DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); 2495 return false; 2496 } 2497 2498 // Collect all of the variables that remain uniform after vectorization. 2499 collectLoopUniforms(); 2500 2501 DEBUG(dbgs() << "LV: We can vectorize this loop" << 2502 (PtrRtCheck.Need ? " (with a runtime bound check)" : "") 2503 <<"!\n"); 2504 2505 // Okay! We can vectorize. At this point we don't have any other mem analysis 2506 // which may limit our maximum vectorization factor, so just return true with 2507 // no restrictions. 2508 return true; 2509} 2510 2511static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) { 2512 if (Ty->isPointerTy()) 2513 return DL.getIntPtrType(Ty->getContext()); 2514 return Ty; 2515} 2516 2517static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) { 2518 Ty0 = convertPointerToIntegerType(DL, Ty0); 2519 Ty1 = convertPointerToIntegerType(DL, Ty1); 2520 if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) 2521 return Ty0; 2522 return Ty1; 2523} 2524 2525bool LoopVectorizationLegality::canVectorizeInstrs() { 2526 BasicBlock *PreHeader = TheLoop->getLoopPreheader(); 2527 BasicBlock *Header = TheLoop->getHeader(); 2528 2529 // If we marked the scalar loop as "already vectorized" then no need 2530 // to vectorize it again. 2531 if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) { 2532 DEBUG(dbgs() << "LV: This loop was vectorized before\n"); 2533 return false; 2534 } 2535 2536 // Look for the attribute signaling the absence of NaNs. 2537 Function &F = *Header->getParent(); 2538 if (F.hasFnAttribute("no-nans-fp-math")) 2539 HasFunNoNaNAttr = F.getAttributes().getAttribute( 2540 AttributeSet::FunctionIndex, 2541 "no-nans-fp-math").getValueAsString() == "true"; 2542 2543 // For each block in the loop. 2544 for (Loop::block_iterator bb = TheLoop->block_begin(), 2545 be = TheLoop->block_end(); bb != be; ++bb) { 2546 2547 // Scan the instructions in the block and look for hazards. 2548 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 2549 ++it) { 2550 2551 if (PHINode *Phi = dyn_cast<PHINode>(it)) { 2552 Type *PhiTy = Phi->getType(); 2553 // Check that this PHI type is allowed. 2554 if (!PhiTy->isIntegerTy() && 2555 !PhiTy->isFloatingPointTy() && 2556 !PhiTy->isPointerTy()) { 2557 DEBUG(dbgs() << "LV: Found a non-int, non-pointer PHI.\n"); 2558 return false; 2559 } 2560 2561 // If this PHINode is not in the header block, then we know that we 2562 // can convert it to a select during if-conversion. No need to check if 2563 // the PHIs in this block are induction or reduction variables. 2564 if (*bb != Header) 2565 continue; 2566 2567 // Header PHIs must have exactly two incoming values (preheader, latch). 2568 if (Phi->getNumIncomingValues() != 2) { 2569 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 2570 return false; 2571 } 2572 2573 // This is the value coming from the preheader. 2574 Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); 2575 // Check if this is an induction variable. 2576 InductionKind IK = isInductionVariable(Phi); 2577 2578 if (IK_NoInduction != IK) { 2579 // Get the widest type. 2580 if (!WidestIndTy) 2581 WidestIndTy = convertPointerToIntegerType(*DL, PhiTy); 2582 else 2583 WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); 2584 2585 // Int inductions are special because we only allow one IV. 2586 if (IK == IK_IntInduction) { 2587 // Use the phi node with the widest type as induction.
Use the last 2588 // one if there are multiple (no good reason for doing this other 2589 // than it is expedient). 2590 if (!Induction || PhiTy == WidestIndTy) 2591 Induction = Phi; 2592 } 2593 2594 DEBUG(dbgs() << "LV: Found an induction variable.\n"); 2595 Inductions[Phi] = InductionInfo(StartValue, IK); 2596 continue; 2597 } 2598 2599 if (AddReductionVar(Phi, RK_IntegerAdd)) { 2600 DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); 2601 continue; 2602 } 2603 if (AddReductionVar(Phi, RK_IntegerMult)) { 2604 DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); 2605 continue; 2606 } 2607 if (AddReductionVar(Phi, RK_IntegerOr)) { 2608 DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); 2609 continue; 2610 } 2611 if (AddReductionVar(Phi, RK_IntegerAnd)) { 2612 DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); 2613 continue; 2614 } 2615 if (AddReductionVar(Phi, RK_IntegerXor)) { 2616 DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); 2617 continue; 2618 } 2619 if (AddReductionVar(Phi, RK_IntegerMinMax)) { 2620 DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n"); 2621 continue; 2622 } 2623 if (AddReductionVar(Phi, RK_FloatMult)) { 2624 DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n"); 2625 continue; 2626 } 2627 if (AddReductionVar(Phi, RK_FloatAdd)) { 2628 DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); 2629 continue; 2630 } 2631 if (AddReductionVar(Phi, RK_FloatMinMax)) { 2632 DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n"); 2633 continue; 2634 } 2635 2636 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 2637 return false; 2638 }// end of PHI handling 2639 2640 // We still don't handle functions. However, we can ignore dbg intrinsic 2641 // calls and we do handle certain intrinsic and libm functions. 2642 CallInst *CI = dyn_cast<CallInst>(it); 2643 if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) { 2644 DEBUG(dbgs() << "LV: Found a call site.\n"); 2645 return false; 2646 } 2647 2648 // Check that the instruction return type is vectorizable. 2649 if (!VectorType::isValidElementType(it->getType()) && 2650 !it->getType()->isVoidTy()) { 2651 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); 2652 return false; 2653 } 2654 2655 // Check that the stored type is vectorizable. 2656 if (StoreInst *ST = dyn_cast<StoreInst>(it)) { 2657 Type *T = ST->getValueOperand()->getType(); 2658 if (!VectorType::isValidElementType(T)) 2659 return false; 2660 } 2661 2662 // Reduction instructions are allowed to have exit users. 2663 // All other instructions must not have external users. 2664 if (!AllowedExit.count(it)) 2665 //Check that all of the users of the loop are inside the BB. 2666 for (Value::use_iterator I = it->use_begin(), E = it->use_end(); 2667 I != E; ++I) { 2668 Instruction *U = cast<Instruction>(*I); 2669 // This user may be a reduction exit value. 2670 if (!TheLoop->contains(U)) { 2671 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); 2672 return false; 2673 } 2674 } 2675 } // next instr. 2676 2677 } 2678 2679 if (!Induction) { 2680 DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); 2681 if (Inductions.empty()) 2682 return false; 2683 } 2684 2685 return true; 2686} 2687 2688void LoopVectorizationLegality::collectLoopUniforms() { 2689 // We now know that the loop is vectorizable! 2690 // Collect variables that will remain uniform after vectorization. 
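  // For example (illustrative): if the latch ends in 'br i1 %exitcond, ...'
  // with '%exitcond = icmp eq i64 %iv.next, %n', then the compare and any
  // non-PHI in-loop instructions reachable through its operands are recorded
  // as uniform.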
2691 std::vector<Value*> Worklist; 2692 BasicBlock *Latch = TheLoop->getLoopLatch(); 2693 2694 // Start with the conditional branch and walk up the block. 2695 Worklist.push_back(Latch->getTerminator()->getOperand(0)); 2696 2697 while (Worklist.size()) { 2698 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 2699 Worklist.pop_back(); 2700 2701 // Look at instructions inside this loop. 2702 // Stop when reaching PHI nodes. 2703 // TODO: we need to follow values all over the loop, not only in this block. 2704 if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) 2705 continue; 2706 2707 // This is a known uniform. 2708 Uniforms.insert(I); 2709 2710 // Insert all operands. 2711 for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { 2712 Worklist.push_back(I->getOperand(i)); 2713 } 2714 } 2715} 2716 2717AliasAnalysis::Location 2718LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) { 2719 if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) 2720 return AA->getLocation(Store); 2721 else if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) 2722 return AA->getLocation(Load); 2723 2724 llvm_unreachable("Should be either load or store instruction"); 2725} 2726 2727bool 2728LoopVectorizationLegality::hasPossibleGlobalWriteReorder( 2729 Value *Object, 2730 Instruction *Inst, 2731 AliasMultiMap& WriteObjects, 2732 unsigned MaxByteWidth) { 2733 2734 AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst); 2735 2736 std::vector<Instruction*>::iterator 2737 it = WriteObjects[Object].begin(), 2738 end = WriteObjects[Object].end(); 2739 2740 for (; it != end; ++it) { 2741 Instruction* I = *it; 2742 if (I == Inst) 2743 continue; 2744 2745 AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I); 2746 if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth), 2747 ThatLoc.getWithNewSize(MaxByteWidth))) 2748 return true; 2749 } 2750 return false; 2751} 2752 2753bool LoopVectorizationLegality::canVectorizeMemory() { 2754 2755 typedef SmallVector<Value*, 16> ValueVector; 2756 typedef SmallPtrSet<Value*, 16> ValueSet; 2757 // Holds the Load and Store *instructions*. 2758 ValueVector Loads; 2759 ValueVector Stores; 2760 PtrRtCheck.Pointers.clear(); 2761 PtrRtCheck.Need = false; 2762 2763 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); 2764 2765 // For each block. 2766 for (Loop::block_iterator bb = TheLoop->block_begin(), 2767 be = TheLoop->block_end(); bb != be; ++bb) { 2768 2769 // Scan the BB and collect legal loads and stores. 2770 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 2771 ++it) { 2772 2773 // If this is a load, save it. If this instruction can read from memory 2774 // but is not a load, then we quit. Notice that we don't handle function 2775 // calls that read or write. 2776 if (it->mayReadFromMemory()) { 2777 LoadInst *Ld = dyn_cast<LoadInst>(it); 2778 if (!Ld) return false; 2779 if (!Ld->isSimple() && !IsAnnotatedParallel) { 2780 DEBUG(dbgs() << "LV: Found a non-simple load.\n"); 2781 return false; 2782 } 2783 Loads.push_back(Ld); 2784 continue; 2785 } 2786 2787 // Save 'store' instructions. Abort if other instructions write to memory. 2788 if (it->mayWriteToMemory()) { 2789 StoreInst *St = dyn_cast<StoreInst>(it); 2790 if (!St) return false; 2791 if (!St->isSimple() && !IsAnnotatedParallel) { 2792 DEBUG(dbgs() << "LV: Found a non-simple store.\n"); 2793 return false; 2794 } 2795 Stores.push_back(St); 2796 } 2797 } // next instr. 2798 } // next block. 2799 2800 // Now we have two lists that hold the loads and the stores. 
2801 // Next, we find the pointers that they use. 2802 2803 // Check if we see any stores. If there are no stores, then we don't 2804 // care if the pointers are *restrict*. 2805 if (!Stores.size()) { 2806 DEBUG(dbgs() << "LV: Found a read-only loop!\n"); 2807 return true; 2808 } 2809 2810 // Holds the read and read-write *pointers* that we find. These maps hold 2811 // unique values for pointers (so no need for multi-map). 2812 AliasMap Reads; 2813 AliasMap ReadWrites; 2814 2815 // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects 2816 // multiple times on the same object. If the ptr is accessed twice, once 2817 // for read and once for write, it will only appear once (on the write 2818 // list). This is okay, since we are going to check for conflicts between 2819 // writes and between reads and writes, but not between reads and reads. 2820 ValueSet Seen; 2821 2822 ValueVector::iterator I, IE; 2823 for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { 2824 StoreInst *ST = cast<StoreInst>(*I); 2825 Value* Ptr = ST->getPointerOperand(); 2826 2827 if (isUniform(Ptr)) { 2828 DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); 2829 return false; 2830 } 2831 2832 // If we did *not* see this pointer before, insert it into 2833 // the read-write list. At this phase it is only a 'write' list. 2834 if (Seen.insert(Ptr)) 2835 ReadWrites.insert(std::make_pair(Ptr, ST)); 2836 } 2837 2838 if (IsAnnotatedParallel) { 2839 DEBUG(dbgs() 2840 << "LV: A loop annotated parallel, ignore memory dependency " 2841 << "checks.\n"); 2842 return true; 2843 } 2844 2845 for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { 2846 LoadInst *LD = cast<LoadInst>(*I); 2847 Value* Ptr = LD->getPointerOperand(); 2848 // If we did *not* see this pointer before, insert it into the 2849 // read list. If we *did* see it before, then it is already in 2850 // the read-write list. This allows us to vectorize expressions 2851 // such as A[i] += x; because the address of A[i] is a read-write 2852 // pointer. This only works if the index of A[i] is consecutive. 2853 // If the index is unknown (for example A[B[i]]) then we may 2854 // read a few words, modify, and write a few words, and some of the 2855 // words may be written to the same address. 2856 if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) 2857 Reads.insert(std::make_pair(Ptr, LD)); 2858 } 2859 2860 // If we write (or read-write) to a single destination and there are no 2861 // other reads in this loop then it is safe to vectorize. 2862 if (ReadWrites.size() == 1 && Reads.size() == 0) { 2863 DEBUG(dbgs() << "LV: Found a write-only loop!\n"); 2864 return true; 2865 } 2866 2867 unsigned NumReadPtrs = 0; 2868 unsigned NumWritePtrs = 0; 2869 2870 // Find pointers with computable bounds. We are going to use this information 2871 // to place a runtime bound check.
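  // For example (illustrative): a pointer whose SCEV is the affine recurrence
  // {%base,+,4} has a computable range starting at %base and spanning
  // 4 * trip-count bytes, which the emitted runtime check compares for
  // overlap against the ranges of the other pointers.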
2872 bool CanDoRT = true; 2873 AliasMap::iterator MI, ME; 2874 for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { 2875 Value *V = (*MI).first; 2876 if (hasComputableBounds(V)) { 2877 PtrRtCheck.insert(SE, TheLoop, V, true); 2878 NumWritePtrs++; 2879 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); 2880 } else { 2881 CanDoRT = false; 2882 break; 2883 } 2884 } 2885 for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { 2886 Value *V = (*MI).first; 2887 if (hasComputableBounds(V)) { 2888 PtrRtCheck.insert(SE, TheLoop, V, false); 2889 NumReadPtrs++; 2890 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); 2891 } else { 2892 CanDoRT = false; 2893 break; 2894 } 2895 } 2896 2897 // Check that we did not collect too many pointers or find an 2898 // unsizeable pointer. 2899 unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1)); 2900 DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n"); 2901 if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { 2902 PtrRtCheck.reset(); 2903 CanDoRT = false; 2904 } 2905 2906 if (CanDoRT) { 2907 DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); 2908 } 2909 2910 bool NeedRTCheck = false; 2911 2912 // Biggest vectorized access possible, vector width * unroll factor. 2913 // TODO: We're being very pessimistic here, find a way to know the 2914 // real access width before getting here. 2915 unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) * 2916 TTI->getMaximumUnrollFactor(); 2917 // Now that the pointers are in two lists (Reads and ReadWrites), we 2918 // can check that there are no conflicts between each of the writes and 2919 // between the writes and the reads. 2920 // Note that WriteObjects duplicates the stores (indexed now by underlying 2921 // objects) to avoid pointing to elements inside ReadWrites. 2922 // TODO: Maybe create a new type where they can interact without duplication. 2923 AliasMultiMap WriteObjects; 2924 ValueVector TempObjects; 2925 2926 // Check that the read-writes do not conflict with other read-write 2927 // pointers. 2928 bool AllWritesIdentified = true; 2929 for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { 2930 Value *Val = (*MI).first; 2931 Instruction *Inst = (*MI).second; 2932 2933 GetUnderlyingObjects(Val, TempObjects, DL); 2934 for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); 2935 UI != UE; ++UI) { 2936 if (!isIdentifiedObject(*UI)) { 2937 DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n"); 2938 NeedRTCheck = true; 2939 AllWritesIdentified = false; 2940 } 2941 2942 // Never seen it before, can't alias. 2943 if (WriteObjects[*UI].empty()) { 2944 DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n"); 2945 WriteObjects[*UI].push_back(Inst); 2946 continue; 2947 } 2948 // Direct alias found. 2949 if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) { 2950 DEBUG(dbgs() << "LV: Found a possible write-write reorder:" 2951 << **UI <<"\n"); 2952 return false; 2953 } 2954 DEBUG(dbgs() << "LV: Found a conflicting global value:" 2955 << **UI <<"\n"); 2956 DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n"); 2957 DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); 2958 2959 // If global alias, make sure they do alias.
      if (hasPossibleGlobalWriteReorder(*UI,
                                        Inst,
                                        WriteObjects,
                                        MaxByteWidth)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder: " << **UI
              << "\n");
        return false;
      }

      // Didn't alias, insert into map for further reference.
      WriteObjects[*UI].push_back(Inst);
    }
    TempObjects.clear();
  }

  // Check that the reads don't conflict with the read-writes.
  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
    Value *Val = (*MI).first;
    GetUnderlyingObjects(Val, TempObjects, DL);
    for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end();
         UI != UE; ++UI) {
      // If all of the writes are identified then we don't care if the read
      // pointer is identified or not.
      if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr: " << **UI << "\n");
        NeedRTCheck = true;
      }

      // Never seen it before, can't alias.
      if (WriteObjects[*UI].empty())
        continue;
      // Direct alias found.
      if (!AA || !isa<GlobalValue>(*UI)) {
        DEBUG(dbgs() << "LV: Found a possible read-write reorder: "
              << **UI << "\n");
        return false;
      }
      DEBUG(dbgs() << "LV: Found a global value: "
            << **UI << "\n");
      Instruction *Inst = (*MI).second;
      DEBUG(dbgs() << "LV: While examining load: " << *Inst << "\n");
      DEBUG(dbgs() << "LV: On value: " << *Val << "\n");

      // The underlying object is a global; make sure the accesses really
      // do alias.
      if (hasPossibleGlobalWriteReorder(*UI,
                                        Inst,
                                        WriteObjects,
                                        MaxByteWidth)) {
        DEBUG(dbgs() << "LV: Found a possible read-write reorder: " << **UI
              << "\n");
        return false;
      }
    }
    TempObjects.clear();
  }

  PtrRtCheck.Need = NeedRTCheck;
  if (NeedRTCheck && !CanDoRT) {
    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
          "the array bounds.\n");
    PtrRtCheck.reset();
    return false;
  }

  DEBUG(dbgs() << "LV: We " << (NeedRTCheck ? "" : "don't ") <<
        "need a runtime memory check.\n");
  return true;
}

static bool hasMultipleUsesOf(Instruction *I,
                              SmallPtrSet<Instruction *, 8> &Insts) {
  unsigned NumUses = 0;
  for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
    if (Insts.count(dyn_cast<Instruction>(*Use)))
      ++NumUses;
    if (NumUses > 1)
      return true;
  }

  return false;
}

static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) {
  for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
    if (!Set.count(dyn_cast<Instruction>(*Use)))
      return false;
  return true;
}

bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Reduction variables are only found in the loop header block.
  if (Phi->getParent() != TheLoop->getHeader())
    return false;

  // Obtain the reduction start value from the value that comes from the loop
  // preheader.
  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction variables, which form a cycle that
  // ends in the phi node.
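  //
  // A sketch of the shape we are looking for (illustrative IR, not taken
  // from this file):
  //
  //   loop:
  //     %sum = phi i32 [ %start, %preheader ], [ %sum.next, %loop ]
  //     %sum.next = add i32 %sum, %val
  //     ...
  //   exit:
  //     %use = ... %sum.next ...
  //
  // Here RdxStart is %start, and %sum.next is the only value that may be
  // used outside the loop (the ExitInstruction).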
  Instruction *ExitInstruction = 0;
  // Indicates that we found a reduction operation in our scan.
  bool FoundReduxOp = false;

  // We start with the PHI node and scan for all of the users of this
  // instruction. All users must be instructions that can be used as reduction
  // variables (such as ADD). We must have a single out-of-block user. The
  // cycle must include the original PHI.
  bool FoundStartPHI = false;

  // To recognize min/max patterns formed by an icmp/select sequence, we store
  // the number of instructions we have seen from the recognized min/max
  // pattern, to make sure we see exactly the two instructions.
  unsigned NumCmpSelectPatternInst = 0;
  ReductionInstDesc ReduxDesc(false, 0);

  SmallPtrSet<Instruction *, 8> VisitedInsts;
  SmallVector<Instruction *, 8> Worklist;
  Worklist.push_back(Phi);
  VisitedInsts.insert(Phi);

  // A value in the reduction can be used:
  //  - By the reduction:
  //      - Reduction operation:
  //        - One use of the reduction value (safe).
  //        - Multiple uses of the reduction value (not safe).
  //      - PHI:
  //        - All uses of the PHI must be the reduction (safe).
  //        - Otherwise, not safe.
  //  - By one instruction outside of the loop (safe).
  //  - By further instructions outside of the loop (not safe).
  //  - By an instruction that is not part of the reduction (not safe).
  //    This is either:
  //      * An instruction type other than PHI or the reduction operation.
  //      * A PHI in the header other than the initial PHI.
  while (!Worklist.empty()) {
    Instruction *Cur = Worklist.back();
    Worklist.pop_back();

    // No users.
    // If the instruction has no users then this is a broken chain and can't
    // be a reduction variable.
    if (Cur->use_empty())
      return false;

    bool IsAPhi = isa<PHINode>(Cur);

    // A header PHI use other than the original PHI.
    if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
      return false;

    // Reductions of instructions such as Div and Sub are only possible if the
    // LHS is the reduction variable.
    if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
        !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
        !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
      return false;

    // Any reduction instruction must be of one of the allowed kinds.
    ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
    if (!ReduxDesc.IsReduction)
      return false;

    // A reduction operation must only have one use of the reduction value.
    if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
        hasMultipleUsesOf(Cur, VisitedInsts))
      return false;

    // All inputs to a PHI node must be reduction values.
    if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
      return false;

    if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
                                     isa<SelectInst>(Cur)))
      ++NumCmpSelectPatternInst;
    if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
                                   isa<SelectInst>(Cur)))
      ++NumCmpSelectPatternInst;

    // Check whether we found a reduction operator.
    FoundReduxOp |= !IsAPhi;

    // Process users of the current instruction. Push non-PHI nodes after PHI
    // nodes onto the stack. This way we are going to have seen all inputs to
    // PHI nodes once we get to them.
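    // Illustrative walk: starting from a header PHI such as %sum, the scan
    // visits the add that uses it; from the add it reaches the header PHI
    // again (closing the cycle) and at most one user outside the loop, which
    // becomes the ExitInstruction.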
    SmallVector<Instruction *, 8> NonPHIs;
    SmallVector<Instruction *, 8> PHIs;
    for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
         ++UI) {
      Instruction *Usr = cast<Instruction>(*UI);

      // Check if we found the exit user.
      BasicBlock *Parent = Usr->getParent();
      if (!TheLoop->contains(Parent)) {
        // Exit if we find multiple outside users.
        if (ExitInstruction != 0)
          return false;
        ExitInstruction = Cur;
        continue;
      }

      // Process instructions only once (termination).
      if (VisitedInsts.insert(Usr)) {
        if (isa<PHINode>(Usr))
          PHIs.push_back(Usr);
        else
          NonPHIs.push_back(Usr);
      }
      // Remember that we completed the cycle.
      if (Usr == Phi)
        FoundStartPHI = true;
    }
    Worklist.append(PHIs.begin(), PHIs.end());
    Worklist.append(NonPHIs.begin(), NonPHIs.end());
  }

  // This means we have seen one but not the other instruction of the min/max
  // pattern, or more than just a single select and cmp.
  if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
      NumCmpSelectPatternInst != 2)
    return false;

  if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
    return false;

  // We found a reduction var if we have reached the original phi node and we
  // only have a single instruction with out-of-loop users.

  // This instruction is allowed to have out-of-loop users.
  AllowedExit.insert(ExitInstruction);

  // Save the description of this reduction variable.
  ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
                         ReduxDesc.MinMaxKind);
  Reductions[Phi] = RD;
  // We've ended the cycle. This is a reduction variable if we have an
  // outside user and it has a binary op.

  return true;
}

/// Returns a ReductionInstDesc describing whether the instruction is part of
/// a Select(ICmp(X, Y), X, Y) pattern corresponding to a min(X, Y) or
/// max(X, Y).
LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev) {

  assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
         "Expected a cmp or select instruction");
  Instruction *Cmp = 0;
  SelectInst *Select = 0;

  // We must handle the select(cmp()) as a single instruction. Advance to the
  // select.
  if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
    if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
      return ReductionInstDesc(false, I);
    return ReductionInstDesc(Select, Prev.MinMaxKind);
  }

  // Only handle single use cases for now.
  if (!(Select = dyn_cast<SelectInst>(I)))
    return ReductionInstDesc(false, I);
  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
      !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
    return ReductionInstDesc(false, I);
  if (!Cmp->hasOneUse())
    return ReductionInstDesc(false, I);

  Value *CmpLeft;
  Value *CmpRight;

  // Look for a min/max pattern.
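  // For instance (illustrative IR), an unsigned-min pair looks like:
  //   %c = icmp ult i32 %a, %b
  //   %m = select i1 %c, i32 %a, i32 %b
  // and is matched by m_UMin below.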
  if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_UIntMin);
  if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_UIntMax);
  if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_SIntMax);
  if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_SIntMin);
  if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMin);
  if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMax);
  if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMin);
  if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMax);

  return ReductionInstDesc(false, I);
}

LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                            ReductionKind Kind,
                                            ReductionInstDesc &Prev) {
  bool FP = I->getType()->isFloatingPointTy();
  bool FastMath = (FP && I->isCommutative() && I->isAssociative());
  switch (I->getOpcode()) {
  default:
    return ReductionInstDesc(false, I);
  case Instruction::PHI:
    if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
               Kind != RK_FloatMinMax))
      return ReductionInstDesc(false, I);
    return ReductionInstDesc(I, Prev.MinMaxKind);
  case Instruction::Sub:
  case Instruction::Add:
    return ReductionInstDesc(Kind == RK_IntegerAdd, I);
  case Instruction::Mul:
    return ReductionInstDesc(Kind == RK_IntegerMult, I);
  case Instruction::And:
    return ReductionInstDesc(Kind == RK_IntegerAnd, I);
  case Instruction::Or:
    return ReductionInstDesc(Kind == RK_IntegerOr, I);
  case Instruction::Xor:
    return ReductionInstDesc(Kind == RK_IntegerXor, I);
  case Instruction::FMul:
    return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
  case Instruction::FAdd:
    return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
    if (Kind != RK_IntegerMinMax &&
        (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
      return ReductionInstDesc(false, I);
    return isMinMaxSelectCmpPattern(I, Prev);
  }
}

LoopVectorizationLegality::InductionKind
LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  Type *PhiTy = Phi->getType();
  // We only handle integer and pointer induction variables.
  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
    return IK_NoInduction;

  // Check that the PHI is consecutive.
  const SCEV *PhiScev = SE->getSCEV(Phi);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: PHI is not a polynomial recurrence.\n");
    return IK_NoInduction;
  }
  const SCEV *Step = AR->getStepRecurrence(*SE);

  // Integer inductions need to have a stride of one (or minus one for the
  // reverse kind).
  if (PhiTy->isIntegerTy()) {
    if (Step->isOne())
      return IK_IntInduction;
    if (Step->isAllOnesValue())
      return IK_ReverseIntInduction;
    return IK_NoInduction;
  }

  // Calculate the pointer stride and check if it is consecutive.
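  // As an illustration (sketched IR, not from this file): for a loop walking
  // an i32 array,
  //   %p = phi i32* [ %base, %preheader ], [ %p.next, %loop ]
  //   %p.next = getelementptr i32* %p, i64 1
  // the SCEV step is +4 bytes, which equals the allocation size of i32, so
  // the PHI is classified as IK_PtrInduction (a step of -4 would give the
  // reverse kind).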
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C)
    return IK_NoInduction;

  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
  if (C->getValue()->equalsInt(Size))
    return IK_PtrInduction;
  if (C->getValue()->equalsInt(0 - Size))
    return IK_ReversePtrInduction;

  return IK_NoInduction;
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
  Value *In0 = const_cast<Value*>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  assert(TheLoop->contains(BB) && "Unknown block used");

  // Blocks that do not dominate the latch need predication.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  return !DT->dominates(BB, Latch);
}

bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    // We might be able to hoist the load.
    if (it->mayReadFromMemory() && !LoadSpeculation.isHoistableLoad(it))
      return false;

    // We don't predicate stores at the moment.
    if (it->mayWriteToMemory() || it->mayThrow())
      return false;

    // The instructions below can trap.
    switch (it->getOpcode()) {
    default: continue;
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::URem:
    case Instruction::SRem:
      return false;
    }
  }

  return true;
}

bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
  const SCEV *PhiScev = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR)
    return false;

  return AR->isAffine();
}

LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
                                                      unsigned UserVF) {
  // Width 1 means no vectorization.
  VectorizationFactor Factor = { 1U, 0U };
  if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
    DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
    return Factor;
  }

  // Find the trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
  DEBUG(dbgs() << "LV: Found trip count: " << TC << "\n");

  unsigned WidestType = getWidestType();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
  unsigned MaxVectorSize = WidestRegister / WidestType;
  DEBUG(dbgs() << "LV: The widest type: " << WidestType << " bits.\n");
  DEBUG(dbgs() << "LV: The widest register is: " << WidestRegister
               << " bits.\n");

  if (MaxVectorSize == 0) {
    DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
  }

  assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
         " into one vector!");

  unsigned VF = MaxVectorSize;

  // If we optimize the program for size, avoid creating the tail loop.
  if (OptForSize) {
    // If we are unable to calculate the trip count (TC is zero), or if the
    // loop runs only once, then don't try to vectorize: a tail loop would be
    // required.
    if (TC < 2) {
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
      return Factor;
    }

    // Find the maximum SIMD width that can fit within the trip count.
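    // Worked example (hypothetical numbers): with TC = 12 and
    // MaxVectorSize = 8 the remainder is 12 % 8 = 4, and a VF of 4 divides
    // the trip count evenly, so no scalar tail is needed. With TC = 16 the
    // remainder is 0 and the full width of 8 is used.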
    VF = TC % MaxVectorSize;

    if (VF == 0)
      VF = MaxVectorSize;

    // If the trip count that we found modulo the vectorization factor is not
    // zero then we require a tail.
    if (VF < 2) {
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
      return Factor;
    }
  }

  if (UserVF != 0) {
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");

    Factor.Width = UserVF;
    return Factor;
  }

  float Cost = expectedCost(1);
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
  for (unsigned i = 2; i <= VF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
    float VectorCost = expectedCost(i) / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  Factor.Width = Width;
  Factor.Cost = Width * Cost;
  return Factor;
}

unsigned LoopVectorizationCostModel::getWidestType() {
  unsigned MaxWidth = 8;

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {
    BasicBlock *BB = *bb;

    // For each instruction in the loop.
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
      Type *T = it->getType();

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
        continue;

      // Examine PHI nodes that are reduction variables.
      if (PHINode *PN = dyn_cast<PHINode>(it))
        if (!Legal->getReductionVars()->count(PN))
          continue;

      // Examine the stored values.
      if (StoreInst *ST = dyn_cast<StoreInst>(it))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // consecutive. However, we do want to take consecutive stores/loads of
      // pointer vectors into account.
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
        continue;

      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
    }
  }

  return MaxWidth;
}

unsigned
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
                                               unsigned UserUF,
                                               unsigned VF,
                                               unsigned LoopCost) {

  // -- The unroll heuristics --
  // We unroll the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the unroll factor:
  // 1. If the code has reductions then we unroll in order to break the cross
  //    iteration dependency.
  // 2. If the loop is really small then we unroll in order to reduce the loop
  //    overhead.
  // 3. We don't unroll if we think that we will spill registers to memory due
  //    to the increased register pressure.
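  //
  // As a worked illustration with made-up numbers: on a target with 16
  // vector registers, a loop with 2 loop-invariant values and at most 7
  // values live at once gets UF = (16 - 2) / 7 = 2 interleaved copies (see
  // the formula below) before we would expect spilling.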
  // Use the user preference, unless 'auto' is selected.
  if (UserUF != 0)
    return UserUF;

  // When we optimize for size we don't unroll.
  if (OptForSize)
    return 1;

  // Do not unroll loops with a relatively small trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop,
                                              TheLoop->getLoopLatch());
  if (TC > 1 && TC < TinyTripCountUnrollThreshold)
    return 1;

  unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
  DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
        " vector registers\n");

  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
  // We divide by these constants, so make sure that we have at least one
  // instruction that uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
  R.NumInstructions = std::max(R.NumInstructions, 1U);

  // We calculate the unroll factor using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the unrolled instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills.
  unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;

  // Clamp the unroll factor to a reasonable range.
  unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF);

  // Clamp the calculated UF to be between 1 and the maximum unroll factor
  // that the target allows.
  if (UF > MaxUnrollSize)
    UF = MaxUnrollSize;
  else if (UF < 1)
    UF = 1;

  if (Legal->getReductionVars()->size()) {
    DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
    return UF;
  }

  // We want to unroll tiny loops in order to reduce the loop overhead.
  // We assume that the cost overhead is 1 and we use the cost model
  // to estimate the cost of the loop and unroll until the cost of the
  // loop overhead is about 5% of the cost of the loop.
  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << "\n");
  if (LoopCost < 20) {
    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
    unsigned NewUF = 20 / LoopCost + 1;
    return std::min(NewUF, UF);
  }

  DEBUG(dbgs() << "LV: Not unrolling.\n");
  return 1;
}

LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search.
  // We scan the instructions linearly and record each time a new interval
  // starts, by placing it in a set. If we find this value in the multi-map
  // then we remove it from the set. The max register usage is the maximum
  // size of the set. We also search for instructions that are defined outside
  // the loop, but are used inside the loop. We need this number separately
  // from the max-interval usage number because, when we unroll,
  // loop-invariant values do not consume more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage R;
  R.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  typedef DenseMap<Instruction*, unsigned> IntervalMap;
  // Maps an instruction index to the instruction.
  DenseMap<unsigned, Instruction*> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Holds the instructions that are used in the loop.
  SmallSet<Instruction*, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value*, 8> LoopInvariants;

  unsigned Index = 0;
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb) {
    R.NumInstructions += (*bb)->size();
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {
      Instruction *I = it;
      IdxToInstr[Index++] = I;

      // Save the end location of each USE.
      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
        Value *U = I->getOperand(i);
        Instruction *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr) continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = Index;
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  typedef SmallVector<Instruction*, 2> InstrList;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
       it != e; ++it)
    TransposeEnds[it->second].push_back(it->first);

  SmallSet<Instruction*, 8> OpenIntervals;
  unsigned MaxUsage = 0;

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
  for (unsigned i = 0; i < Index; ++i) {
    Instruction *I = IdxToInstr[i];
    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I)) continue;

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (unsigned j = 0, e = List.size(); j < e; ++j)
      OpenIntervals.erase(List[j]);

    // Count the number of live intervals.
    MaxUsage = std::max(MaxUsage, OpenIntervals.size());

    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
          OpenIntervals.size() << "\n");
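    // Illustrative trace (made-up indices): if %a and %b open intervals at
    // indices 1 and 2 and both end at index 5, OpenIntervals holds {%a, %b}
    // in between, so MaxUsage reaches 2 even though the loop may contain
    // many more instructions.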
    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  unsigned Invariant = LoopInvariants.size();
  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << "\n");
  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << "\n");
  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << "\n");

  R.LoopInvariantRegs = Invariant;
  R.MaxLocalUsers = MaxUsage;
  return R;
}

unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  unsigned Cost = 0;

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {
    unsigned BlockCost = 0;
    BasicBlock *BB = *bb;

    // For each instruction in the old loop.
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
      // Skip dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(it))
        continue;

      unsigned C = getInstructionCost(it, VF);
      BlockCost += C;
      DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
            VF << " for instruction: " << *it << "\n");
    }

    // We assume that if-converted blocks have a 50% chance of being executed.
    // When the code is scalar then some of the blocks are avoided due to
    // control flow. When the code is vectorized we execute all code paths.
    if (Legal->blockNeedsPredication(*bb) && VF == 1)
      BlockCost /= 2;

    Cost += BlockCost;
  }

  return Cost;
}

unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  Type *RetTy = I->getType();
  Type *VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    return TTI.getCFInstrCost(I->getOpcode());
  }
  case Instruction::PHI:
    // TODO: If-converted branches become selects.
    return 0;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
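    // For example (illustrative): in a loop computing A[i] << 3 the shift
    // amount is the uniform constant 3, so Op2VK below becomes
    // OK_UniformConstantValue and the target may report a cheaper cost than
    // for a shift by a variable amount.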
    TargetTransformInfo::OperandValueKind Op1VK =
      TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
      TargetTransformInfo::OK_AnyValue;

    if (isa<ConstantInt>(I->getOperand(1)))
      Op2VK = TargetTransformInfo::OK_UniformConstantValue;

    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store:
  case Instruction::Load: {
    StoreInst *SI = dyn_cast<StoreInst>(I);
    LoadInst *LI = dyn_cast<LoadInst>(I);
    Type *ValTy = (SI ? SI->getValueOperand()->getType() :
                   LI->getType());
    VectorTy = ToVectorTy(ValTy, VF);

    unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
    unsigned AS = SI ? SI->getPointerAddressSpace() :
                  LI->getPointerAddressSpace();
    Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
    // We add the cost of address computation here instead of with the GEP
    // instruction because only here do we know whether the operation is
    // scalarized.
    if (VF == 1)
      return TTI.getAddressComputationCost(VectorTy) +
        TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

    // Scalarized loads/stores.
    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
    bool Reverse = ConsecutiveStride < 0;
    unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
    unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy) / VF;
    if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
      unsigned Cost = 0;
      // The cost of extracting from the value vector and pointer vector.
      Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
      for (unsigned i = 0; i < VF; ++i) {
        // The cost of extracting the pointer operand.
        Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
        // In case of STORE, the cost of ExtractElement from the vector.
        // In case of LOAD, the cost of InsertElement into the returned
        // vector.
        Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
                                            Instruction::InsertElement,
                                       VectorTy, i);
      }

      // The cost of the scalar loads/stores.
      Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
      Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                       Alignment, AS);
      return Cost;
    }
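    // Worked illustration (made-up costs): scalarizing a VF = 4 load pays
    // for four pointer extracts, four value inserts, four scalar address
    // computations and four scalar loads. A consecutive access instead pays
    // for a single wide load plus one address computation, as computed below.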
    // Wide loads/stores.
    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

    if (Reverse)
      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                 VectorTy, 0);
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variables: the cost of such a
    // truncation is the same as the cost of the scalar operation.
    if (I->getOpcode() == Instruction::Trunc &&
        Legal->isInductionVariable(I->getOperand(0)))
      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
                                  I->getOperand(0)->getType());

    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(I);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    assert(ID && "Not an intrinsic call!");
    Type *RetTy = ToVectorTy(CI->getType(), VF);
    SmallVector<Type*, 4> Tys;
    for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
      Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
    return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
  }
  default: {
    // We are scalarizing the instruction. Return the cost of the scalar
    // instruction, plus the cost of insert and extract into vector
    // elements, times the vector width.
    unsigned Cost = 0;

    if (!RetTy->isVoidTy() && VF != 1) {
      unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
                                                VectorTy);
      unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
                                                VectorTy);

      // The cost of inserting the results plus extracting each one of the
      // operands.
      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
    }

    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
    return Cost;
  }
  } // end of switch.
}

Type *LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {
  Pass *createLoopVectorizePass() {
    return new LoopVectorize();
  }
}

bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check for a store.
  if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
    return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;

  // Check for a load.
  if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
    return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;

  return false;
}