LoopVectorize.cpp revision 8c6b73666bdd08f15b31c00bd2fd663b632a1d65
//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR. Legalization of the IR is done
// in the codegen. However, the vectorizer uses (will use) the codegen
// interfaces to generate IR that is likely to result in an optimal binary.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. SingleBlockLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

// Debug-output prefix used by DEBUG(dbgs() << "LV: ...") throughout this file.
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Value.h"
#include "llvm/Function.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;

/// Command-line override for the vectorization factor. Zero (the default)
/// means "let the cost model pick"; any other value forces that width and
/// bypasses the cost model entirely (see LoopVectorize::runOnLoop).
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
          cl::desc("Set the default vectorization width. Zero is autoselect."));

/// We don't vectorize loops with a known constant trip count below this number.
const unsigned TinyTripCountThreshold = 16;

/// When performing a runtime memory check, do not check more than this
/// number of pointers. Notice that the check is quadratic!
const unsigned RuntimeMemoryCheckThreshold = 2;

namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;

/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// SingleBlockLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The SingleBlockLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class SingleBlockLoopVectorizer {
public:
  /// Ctor. Takes the analyses the vectorizer consumes and the chosen
  /// vectorization factor; no IR is modified until vectorize() is called.
  SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
                            DominatorTree *dt, DataLayout *dl,
                            LPPassManager *Lpm,
                            unsigned VecWidth):
    OrigLoop(Orig), SE(Se), LI(Li), DT(dt), DL(dl), LPM(Lpm), VF(VecWidth),
    Builder(Se->getContext()), Induction(0), OldInduction(0) { }

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *Legal) {
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop(Legal);
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop(Legal);
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }

private:
  /// Add code that checks at runtime if the accessed arrays overlap.
  /// Returns the comparator value or NULL if no check is needed.
  Value* addRuntimeCheck(LoopVectorizationLegality *Legal,
                         Instruction *Loc);
  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop(LoopVectorizationLegality *Legal);
  /// Copy and widen the instructions from the old loop.
  void vectorizeLoop(LoopVectorizationLegality *Legal);
  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars.
  void scalarizeInstruction(Instruction *Instr);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  Value *getBroadcastInstrs(Value *V);

  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
  /// for each element in the vector. Starting from zero.
  Value *getConsecutiveVector(Value* Val);

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  Value *getVectorValue(Value *V);

  /// Get a uniform vector of constant integers. We use this to get
  /// vectors of ones and zeros for the reduction code.
  Constant* getUniformVector(unsigned Val, Type* ScalarTy);

  typedef DenseMap<Value*, Value*> ValueMap;

  /// The original loop.
  Loop *OrigLoop;
  // Scev analysis to use.
  ScalarEvolution *SE;
  // Loop Info.
  LoopInfo *LI;
  // Dominator Tree.
  DominatorTree *DT;
  // Data Layout.
  DataLayout *DL;
  // Loop Pass Manager.
  LPPassManager *LPM;
  // The vectorization factor to use.
  unsigned VF;

  // The builder that we use to emit all new instructions.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  BasicBlock *LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// The first bypass block.
  BasicBlock *LoopBypassBlock;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  // Maps scalars to widened vectors.
  ValueMap WidenMap;
};

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeBlock checks for a number
///   of different conditions, such as the availability of a single induction
///   variable, that all types are supported and vectorize-able, etc.
/// This code reflects the capabilities of SingleBlockLoopVectorizer.
/// This class is also used by SingleBlockLoopVectorizer for identifying
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
    TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }

  /// This represents the kinds of reductions that we support.
  enum ReductionKind {
    NoReduction, /// Not a reduction.
    IntegerAdd,  /// Sum of numbers.
    IntegerMult, /// Product of numbers.
    IntegerOr,   /// Bitwise or logical OR of numbers.
    IntegerAnd,  /// Bitwise or logical AND of numbers.
    IntegerXor   /// Bitwise or logical XOR of numbers.
  };

  /// This POD struct holds information about reduction variables.
  struct ReductionDescriptor {
    // Default C'tor
    ReductionDescriptor():
      StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}

    // C'tor.
    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
      StartValue(Start), LoopExitInstr(Exit), Kind(K) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    Value *StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
  };

  // This POD struct holds information about the memory runtime legality
  // check that a group of pointers do not overlap.
  struct RuntimePointerCheck {
    RuntimePointerCheck(): Need(false) {}

    /// Reset the state of the pointer runtime information.
    void reset() {
      Need = false;
      Pointers.clear();
      Starts.clear();
      Ends.clear();
    }

    /// Insert a pointer and calculate the start and end SCEVs.
    /// Asserts that Ptr's SCEV is an add-recurrence, i.e. the caller must
    /// have already checked hasComputableBounds().
    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
      const SCEV *Sc = SE->getSCEV(Ptr);
      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
      assert(AR && "Invalid addrec expression");
      const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader());
      // The pointer value on the last iteration is the recurrence evaluated
      // at the exit count.
      const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
      Pointers.push_back(Ptr);
      Starts.push_back(AR->getStart());
      Ends.push_back(ScEnd);
    }

    /// This flag indicates if we need to add the runtime check.
    bool Need;
    /// Holds the pointers that we need to check.
    SmallVector<Value*, 2> Pointers;
    /// Holds the pointer value at the beginning of the loop.
    SmallVector<const SCEV*, 2> Starts;
    /// Holds the pointer value at the end of the loop.
    SmallVector<const SCEV*, 2> Ends;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the initial
  /// value entering the loop.
  typedef DenseMap<PHINode*, Value*> InductionList;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() {return Induction;}

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or that the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  bool isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}

  /// Returns the information that we collected about runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeBlock(BasicBlock &BB);

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if BB is vectorizable
  bool canVectorizeMemory(BasicBlock &BB);

  /// Returns True, if 'Phi' is the kind of reduction variable for type
  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns true if the instruction I can be a reduction variable of type
  /// 'Kind'.
  bool isReductionInstr(Instruction *I, ReductionKind Kind);
  /// Returns True, if 'Phi' is an induction variable.
  bool isInductionVariable(PHINode *Phi);
  /// Return true if can compute the address bounds of Ptr within the loop.
  bool hasComputableBounds(Value *Ptr);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  DataLayout *DL;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because
/// of a number of reasons. In this class we mainly attempt to predict
/// the expected speedup/slowdowns due to the supported instruction set.
/// We use the VectorTargetTransformInfo to query the different backends
/// for the cost of different operations.
class LoopVectorizationCostModel {
public:
  /// C'tor.
  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
                             LoopVectorizationLegality *Leg,
                             const VectorTargetTransformInfo *Vtti):
    TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }

  /// Returns the most profitable vectorization factor for the loop that is
  /// smaller or equal to the VF argument. This method checks every power
  /// of two up to VF.
  unsigned findBestVectorizationFactor(unsigned VF = 8);

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const VectorTargetTransformInfo *VTTI;
};

/// The LoopVectorize Pass. Drives legality analysis, the cost model, and the
/// actual widening, in that order.
struct LoopVectorize : public LoopPass {
  static char ID; // Pass identification, replacement for typeid

  LoopVectorize() : LoopPass(ID) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  DataLayout *DL;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
    // We only vectorize innermost loops.
    if (!L->empty())
      return false;

    SE = &getAnalysis<ScalarEvolution>();
    // NOTE(review): DL and TTI are optional analyses and may be null here;
    // confirm that downstream users (e.g. createEmptyLoop's use of DL for
    // loops without an integer induction) tolerate a null pointer.
    DL = getAnalysisIfAvailable<DataLayout>();
    LI = &getAnalysis<LoopInfo>();
    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();

    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
          L->getHeader()->getParent()->getName() << "\"\n");

    // Check if it is legal to vectorize the loop.
    LoopVectorizationLegality LVL(L, SE, DL);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Select the preferred vectorization factor.
    unsigned VF = 1;
    if (VectorizationFactor == 0) {
      const VectorTargetTransformInfo *VTTI = 0;
      if (TTI)
        VTTI = TTI->getVectorTargetTransformInfo();
      // Use the cost model.
      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
      VF = CM.findBestVectorizationFactor();

      if (VF == 1) {
        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
        return false;
      }

    } else {
      // Use the user command flag.
      VF = VectorizationFactor;
    }

    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
          "\n");

    // If we decided that it is *legal* to vectorize the loop then do it.
    SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, &LPM, VF);
    LB.vectorize(&LVL);

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    // The vectorizer requires loops in simplified and LCSSA form.
    LoopPass::getAnalysisUsage(AU);
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<DominatorTree>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
  }

};

Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
  // Create the types.
  LLVMContext &C = V->getContext();
  Type *VTy = VectorType::get(V->getType(), VF);
  Type *I32 = IntegerType::getInt32Ty(C);
  Constant *Zero = ConstantInt::get(I32, 0);
  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
  Value *UndefVal = UndefValue::get(VTy);
  // Insert the value into a new vector.
  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
                                            "broadcast");
  // We are accessing the induction variable.
  // Make sure to promote the
  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
  if (V == Induction)
    return getConsecutiveVector(Shuf);
  return Shuf;
}

Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  // Create the types.
  Type *ITy = Val->getType()->getScalarType();
  VectorType *Ty = cast<VectorType>(Val->getType());
  unsigned VLen = Ty->getNumElements();
  SmallVector<Constant*, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  for (unsigned i = 0; i < VLen; ++i)
    Indices.push_back(ConstantInt::get(ITy, i));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  return Builder.CreateAdd(Val, Cv, "induction");
}

bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");

  // If this pointer is an induction variable, return it.
  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  if (Phi && getInductionVars()->count(Phi))
    return true;

  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return false;

  unsigned NumOperands = Gep->getNumOperands();
  Value *LastIndex = Gep->getOperand(NumOperands - 1);

  // Check that all of the gep indices are uniform except for the last.
  // Note: operand 0 is the GEP base pointer, which must also be invariant.
  for (unsigned i = 0; i < NumOperands - 1; ++i)
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
      return false;

  // We can emit wide load/stores only if the last index is the induction
  // variable.
  const SCEV *Last = SE->getSCEV(LastIndex);
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // The memory is consecutive because the last index is consecutive
    // and all other indices are loop invariant.
    if (Step->isOne())
      return true;
  }

  return false;
}

bool LoopVectorizationLegality::isUniform(Value *V) {
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}

Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  // If we saved a vectorized copy of V, use it.
  Value *&MapEntry = WidenMap[V];
  if (MapEntry)
    return MapEntry;

  // Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  MapEntry = B;
  return B;
}

Constant*
SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
  return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
}

void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<Value*, 8> Params;

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(Induction));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap[SrcInst]);
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a constant.
      Params.push_back(SrcOp);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();
  Value *VecResults = 0;

  // If we have a return value, create an empty vector. We place the scalarized
  // instructions in this vector.
  if (!IsVoidRetTy)
    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));

  // For each scalar that we create: clone the instruction once per lane,
  // rewiring its operands to the extracted per-lane scalars.
  for (unsigned i = 0; i < VF; ++i) {
    Instruction *Cloned = Instr->clone();
    if (!IsVoidRetTy)
      Cloned->setName(Instr->getName() + ".cloned");
    // Replace the operands of the cloned instructions with extracted scalars.
    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
      Value *Op = Params[op];
      // Param is a vector. Need to extract the right lane.
      if (Op->getType()->isVectorTy())
        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
      Cloned->setOperand(op, Op);
    }

    // Place the cloned scalar in the new loop.
    Builder.Insert(Cloned);

    // If the original scalar returns a value we need to place it in a vector
    // so that future users will be able to use it.
    if (!IsVoidRetTy)
      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
                                               Builder.getInt32(i));
  }

  if (!IsVoidRetTy)
    WidenMap[Instr] = VecResults;
}

Value*
SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
                                           Instruction *Loc) {
  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
    Legal->getRuntimePointerCheck();

  if (!PtrRtCheck->Need)
    return NULL;

  Value *MemoryRuntimeCheck = 0;
  unsigned NumPointers = PtrRtCheck->Pointers.size();
  SmallVector<Value* , 2> Starts;
  SmallVector<Value* , 2> Ends;

  SCEVExpander Exp(*SE, "induction");

  // Use this type for pointer arithmetic.
  Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();

  // Expand the [start, end) range of every checked pointer into runtime
  // values, emitted before Loc.
  for (unsigned i=0; i < NumPointers; ++i) {
    Value *Ptr = PtrRtCheck->Pointers[i];
    const SCEV *Sc = SE->getSCEV(Ptr);

    if (SE->isLoopInvariant(Sc, OrigLoop)) {
      // An invariant pointer covers a single address: start == end.
      DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
            *Ptr <<"\n");
      Starts.push_back(Ptr);
      Ends.push_back(Ptr);
    } else {
      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");

      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
                                       PtrArithTy, Loc);
      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
      Starts.push_back(Start);
      Ends.push_back(End);
    }
  }

  // Emit the quadratic pairwise overlap test: ranges [i] and [j] conflict
  // iff Start[i] <= End[j] && Start[j] <= End[i]. OR all pair results.
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i+1; j < NumPointers; ++j) {
      Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
                                    Starts[i], Ends[j], "bound0", Loc);
      Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
                                    Starts[j], Ends[i], "bound1", Loc);
      Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
                                                 "found.conflict", Loc);
      if (MemoryRuntimeCheck) {
        MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
                                                    MemoryRuntimeCheck,
                                                    IsConflict,
                                                    "conflict.rdx", Loc);
      } else {
        MemoryRuntimeCheck = IsConflict;
      }
    }
  }

  return MemoryRuntimeCheck;
}

void
SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

    [ ]     <-- vector loop bypass.
  /  |
 /   v
|   [ ]     <-- vector pre header.
|    |
|    v
|   [ ] \
|   [ ]_|   <-- vector loop.
|    |
 \   v
   >[ ]     <--- middle-block.
  /  |
 /   v
|   [ ]     <--- new preheader.
|    |
|    v
|   [ ] \
|   [ ]_|   <-- old scalar loop to handle remainder.
 \   |
  \  v
   >[ ]     <-- exit block.
   ...
  */

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  OldInduction = Legal->getInduction();
  // NOTE(review): DL is obtained via getAnalysisIfAvailable and may be null;
  // confirm this branch cannot be reached without DataLayout.
  Type *IdxTy = OldInduction ? OldInduction->getType() :
    DL->getIntPtrType(SE->getContext());

  // Find the loop boundaries.
  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");

  // Get the total trip count from the count by adding 1.
  ExitCount = SE->getAddExpr(ExitCount,
                             SE->getConstant(ExitCount->getType(), 1));

  // This is the original scalar-loop preheader.
  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(ExitBlock && "Must have an exit block");

  // The loop index does not have to start at Zero. Find the original start
  // value from the induction PHI node. If we don't have an induction variable
  // then we know that it starts at zero.
  Value *StartIdx = OldInduction ?
    OldInduction->getIncomingValueForBlock(BypassBlock):
    ConstantInt::get(IdxTy, 0);

  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
  assert(BypassBlock && "Invalid loop structure");

  // Split the preheader three times to materialize the block structure from
  // the diagram above: bypass -> vector.ph -> vector.body -> middle.block
  // -> scalar.preheader (the old loop header follows).
  BasicBlock *VectorPH =
    BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
                                                  "vector.body");

  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
                                                     "middle.block");
  BasicBlock *ScalarPH =
    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
                                 "scalar.preheader");
  // Find the induction variable.
  BasicBlock *OldBasicBlock = OrigLoop->getHeader();

  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
  // inside the loop.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Generate the induction variable.
  Induction = Builder.CreatePHI(IdxTy, 2, "index");
  Constant *Step = ConstantInt::get(IdxTy, VF);

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, "induction");
  Instruction *Loc = BypassBlock->getTerminator();

  // Count holds the overall loop count (N).
  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), Loc);

  // We may need to extend the index in case there is a type mismatch.
  // We know that the count starts at zero and does not overflow.
  if (Count->getType() != IdxTy) {
    // The exit count can be of pointer type. Convert it to the correct
    // integer type.
    if (ExitCount->getType()->isPointerTy())
      Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
    else
      Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
  }

  // Add the start index to the loop count to get the new end index.
  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);

  // Now we need to generate the expression for N - (N % VF), which is
  // the part that the vectorized body will execute.
  Constant *CIVF = ConstantInt::get(IdxTy, VF);
  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
  Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
                                                     "end.idx.rnd.down", Loc);

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop.
  Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                               IdxEndRoundDown,
                               StartIdx,
                               "cmp.zero", Loc);

  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc);

  // If we are using memory runtime checks, include them in.
  if (MemoryRuntimeCheck) {
    Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
                                 "CntOrMem", Loc);
  }

  BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
  // Remove the old terminator.
  Loc->eraseFromParent();

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original start
  // value.

  // This variable saves the new starting index for the scalar loop.
  PHINode *ResumeIndex = 0;
  LoopVectorizationLegality::InductionList::iterator I, E;
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (I = List->begin(), E = List->end(); I != E; ++I) {
    PHINode *OrigPhi = I->first;
    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
                                         MiddleBlock->getTerminator());
    Value *EndValue = 0;
    if (OrigPhi->getType()->isIntegerTy()) {
      // Handle the integer induction counter:
      assert(OrigPhi == OldInduction && "Unknown integer PHI");
      // We know what the end value is.
      EndValue = IdxEndRoundDown;
      // We also know which PHI node holds it.
      ResumeIndex = ResumeVal;
    } else {
      // For pointer induction variables, calculate the offset using
      // the end index.
      EndValue = GetElementPtrInst::Create(I->second, CountRoundDown,
                                           "ptr.ind.end",
                                           BypassBlock->getTerminator());
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    ResumeVal->addIncoming(I->second, BypassBlock);
    ResumeVal->addIncoming(EndValue, VecBody);

    // Fix the scalar body counter (PHI node).
    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
  }

  // If we are generating a new induction variable then we also need to
  // generate the code that calculates the exit value. This value is not
  // simply the end of the counter because we may skip the vectorized body
  // in case of a runtime check.
  if (!OldInduction){
    assert(!ResumeIndex && "Unexpected resume value found");
    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
                                  MiddleBlock->getTerminator());
    ResumeIndex->addIncoming(StartIdx, BypassBlock);
    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
  }

  // Make sure that we found the index where scalar loop needs to continue.
926 assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && 927 "Invalid resume Index"); 928 929 // Add a check in the middle block to see if we have completed 930 // all of the iterations in the first vector loop. 931 // If (N - N%VF) == N, then we *don't* need to run the remainder. 932 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 933 ResumeIndex, "cmp.n", 934 MiddleBlock->getTerminator()); 935 936 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 937 // Remove the old terminator. 938 MiddleBlock->getTerminator()->eraseFromParent(); 939 940 // Create i+1 and fill the PHINode. 941 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 942 Induction->addIncoming(StartIdx, VectorPH); 943 Induction->addIncoming(NextIdx, VecBody); 944 // Create the compare. 945 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 946 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 947 948 // Now we have two terminators. Remove the old one from the block. 949 VecBody->getTerminator()->eraseFromParent(); 950 951 // Get ready to start creating new instructions into the vectorized body. 952 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 953 954 // Register the new loop. 955 Loop* Lp = new Loop(); 956 LPM->insertLoop(Lp, OrigLoop->getParentLoop()); 957 958 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 959 960 Loop *ParentLoop = OrigLoop->getParentLoop(); 961 if (ParentLoop) { 962 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 963 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 964 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 965 } 966 967 // Save the state. 
968 LoopVectorPreHeader = VectorPH; 969 LoopScalarPreHeader = ScalarPH; 970 LoopMiddleBlock = MiddleBlock; 971 LoopExitBlock = ExitBlock; 972 LoopVectorBody = VecBody; 973 LoopScalarBody = OldBasicBlock; 974 LoopBypassBlock = BypassBlock; 975} 976 977/// This function returns the identity element (or neutral element) for 978/// the operation K. 979static unsigned 980getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { 981 switch (K) { 982 case LoopVectorizationLegality::IntegerXor: 983 case LoopVectorizationLegality::IntegerAdd: 984 case LoopVectorizationLegality::IntegerOr: 985 // Adding, Xoring, Oring zero to a number does not change it. 986 return 0; 987 case LoopVectorizationLegality::IntegerMult: 988 // Multiplying a number by 1 does not change it. 989 return 1; 990 case LoopVectorizationLegality::IntegerAnd: 991 // AND-ing a number with an all-1 value does not change it. 992 return -1; 993 default: 994 llvm_unreachable("Unknown reduction kind"); 995 } 996} 997 998void 999SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { 1000 //===------------------------------------------------===// 1001 // 1002 // Notice: any optimization or new instruction that go 1003 // into the code below should be also be implemented in 1004 // the cost-model. 1005 // 1006 //===------------------------------------------------===// 1007 typedef SmallVector<PHINode*, 4> PhiVector; 1008 BasicBlock &BB = *OrigLoop->getHeader(); 1009 Constant *Zero = ConstantInt::get( 1010 IntegerType::getInt32Ty(BB.getContext()), 0); 1011 1012 // In order to support reduction variables we need to be able to vectorize 1013 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 1014 // steages. First, we create a new vector PHI node with no incoming edges. 1015 // We use this value when we vectorize all of the instructions that use the 1016 // PHI. 
  // Next, after all of the instructions in the block are complete we
  // add the new incoming edges to the PHI. At this point all of the
  // instructions in the basic block are vectorized, so we can use them to
  // construct the PHI.
  PhiVector RdxPHIsToFix;

  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *Inst = it;

    switch (Inst->getOpcode()) {
      case Instruction::Br:
        // Nothing to do for PHIs and BR, since we already took care of the
        // loop control flow instructions.
        continue;
      case Instruction::PHI:{
        PHINode* P = cast<PHINode>(Inst);
        // Handle reduction variables:
        if (Legal->getReductionVars()->count(P)) {
          // This is phase one of vectorizing PHIs.
          Type *VecTy = VectorType::get(Inst->getType(), VF);
          WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi",
                                           LoopVectorBody->getFirstInsertionPt());
          RdxPHIsToFix.push_back(P);
          continue;
        }

        // This PHINode must be an induction variable.
        // Make sure that we know about it.
        assert(Legal->getInductionVars()->count(P) &&
               "Not an induction variable");

        if (P->getType()->isIntegerTy()) {
          assert(P == OldInduction && "Unexpected PHI");
          WidenMap[Inst] = getBroadcastInstrs(Induction);
          continue;
        }

        // Handle pointer inductions:
        assert(P->getType()->isPointerTy() && "Unexpected type.");
        Value *StartIdx = OldInduction ?
          Legal->getInductionVars()->lookup(OldInduction) :
          ConstantInt::get(Induction->getType(), 0);

        // This is the pointer value coming into the loop.
        Value *StartPtr = Legal->getInductionVars()->lookup(P);

        // This is the normalized GEP that starts counting at zero.
        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
                                                 "normalized.idx");

        // This is the vector of results. Notice that we don't generate vector
        // geps because scalar geps result in better code.
        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
        for (unsigned int i = 0; i < VF; ++i) {
          Constant *Idx = ConstantInt::get(Induction->getType(), i);
          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
          Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep");
          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
                                               Builder.getInt32(i),
                                               "insert.gep");
        }

        WidenMap[Inst] = VecVal;
        continue;
      }
      case Instruction::Add:
      case Instruction::FAdd:
      case Instruction::Sub:
      case Instruction::FSub:
      case Instruction::Mul:
      case Instruction::FMul:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::FDiv:
      case Instruction::URem:
      case Instruction::SRem:
      case Instruction::FRem:
      case Instruction::Shl:
      case Instruction::LShr:
      case Instruction::AShr:
      case Instruction::And:
      case Instruction::Or:
      case Instruction::Xor: {
        // Just widen binops.
        // The dyn_cast cannot fail here: the switch on the opcode guarantees
        // that Inst is a BinaryOperator.
        BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
        Value *A = getVectorValue(Inst->getOperand(0));
        Value *B = getVectorValue(Inst->getOperand(1));

        // Use this vector value for all users of the original instruction.
        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
        WidenMap[Inst] = V;

        // Update the NSW, NUW and Exact flags.
        BinaryOperator *VecOp = cast<BinaryOperator>(V);
        if (isa<OverflowingBinaryOperator>(BinOp)) {
          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
        }
        if (isa<PossiblyExactOperator>(VecOp))
          VecOp->setIsExact(BinOp->isExact());
        break;
      }
      case Instruction::Select: {
        // Widen selects.
        // If the selector is loop invariant we can create a select
        // instruction with a scalar condition. Otherwise, use vector-select.
        Value *Cond = Inst->getOperand(0);
        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);

        // The condition can be loop invariant but still defined inside the
        // loop. This means that we can't just use the original 'cond' value.
        // We have to take the 'vectorized' value and pick the first lane.
        // Instcombine will make this a no-op.
        Cond = getVectorValue(Cond);
        if (InvariantCond)
          Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));

        Value *Op0 = getVectorValue(Inst->getOperand(1));
        Value *Op1 = getVectorValue(Inst->getOperand(2));
        WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
        break;
      }

      case Instruction::ICmp:
      case Instruction::FCmp: {
        // Widen compares. Generate vector compares.
        bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
        CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
        Value *A = getVectorValue(Inst->getOperand(0));
        Value *B = getVectorValue(Inst->getOperand(1));
        if (FCmp)
          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
        else
          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
        break;
      }

      case Instruction::Store: {
        // Attempt to issue a wide store.
        StoreInst *SI = dyn_cast<StoreInst>(Inst);
        Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
        Value *Ptr = SI->getPointerOperand();
        unsigned Alignment = SI->getAlignment();

        assert(!Legal->isUniform(Ptr) &&
               "We do not allow storing to uniform addresses");

        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);

        // This store does not use GEPs.
        if (!Legal->isConsecutivePtr(Ptr)) {
          scalarizeInstruction(Inst);
          break;
        }

        if (Gep) {
          // The last index does not have to be the induction. It can be
          // consecutive and be a function of the index. For example A[I+1];
          unsigned NumOperands = Gep->getNumOperands();
          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

          // Create the new GEP with the new induction variable.
          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
          Gep2->setOperand(NumOperands - 1, LastIndex);
          Ptr = Builder.Insert(Gep2);
        } else {
          // Use the induction element ptr.
          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
        }
        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
        Value *Val = getVectorValue(SI->getValueOperand());
        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
        break;
      }
      case Instruction::Load: {
        // Attempt to issue a wide load.
        LoadInst *LI = dyn_cast<LoadInst>(Inst);
        Type *RetTy = VectorType::get(LI->getType(), VF);
        Value *Ptr = LI->getPointerOperand();
        unsigned Alignment = LI->getAlignment();
        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);

        // If the pointer is loop invariant or if it is non consecutive,
        // scalarize the load.
        bool Con = Legal->isConsecutivePtr(Ptr);
        if (Legal->isUniform(Ptr) || !Con) {
          scalarizeInstruction(Inst);
          break;
        }

        if (Gep) {
          // The last index does not have to be the induction. It can be
          // consecutive and be a function of the index. For example A[I+1];
          unsigned NumOperands = Gep->getNumOperands();
          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

          // Create the new GEP with the new induction variable.
          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
          Gep2->setOperand(NumOperands - 1, LastIndex);
          Ptr = Builder.Insert(Gep2);
        } else {
          // Use the induction element ptr.
          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
        }

        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
        LI = Builder.CreateLoad(Ptr);
        LI->setAlignment(Alignment);
        // Use this vector value for all users of the load.
        WidenMap[Inst] = LI;
        break;
      }
      case Instruction::ZExt:
      case Instruction::SExt:
      case Instruction::FPToUI:
      case Instruction::FPToSI:
      case Instruction::FPExt:
      case Instruction::PtrToInt:
      case Instruction::IntToPtr:
      case Instruction::SIToFP:
      case Instruction::UIToFP:
      case Instruction::Trunc:
      case Instruction::FPTrunc:
      case Instruction::BitCast: {
        /// Vectorize bitcasts.
        CastInst *CI = dyn_cast<CastInst>(Inst);
        Value *A = getVectorValue(Inst->getOperand(0));
        Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
        break;
      }

      default:
        /// All other instructions are unsupported. Scalarize them.
        scalarizeInstruction(Inst);
        break;
    }// end of switch.
  }// end of for_each instr.

  // At this point every instruction in the original loop is widened to
  // a vector form. We are almost done. Now, we need to fix the PHI nodes
  // that we vectorized. The PHI nodes are currently empty because we did
  // not want to introduce cycles. Notice that the remaining PHI nodes
  // that we need to fix are reduction variables.

  // Create the 'reduced' values for each of the induction vars.
  // The reduced values are the vector values that we scalarize and combine
  // after the loop is finished.
  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
       it != e; ++it) {
    PHINode *RdxPhi = *it;
    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
    assert(RdxPhi && "Unable to recover vectorized PHI");

    // Find the reduction variable descriptor.
    assert(Legal->getReductionVars()->count(RdxPhi) &&
           "Unable to find the reduction variable");
    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
      (*Legal->getReductionVars())[RdxPhi];

    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
    Type *VecTy = VectorExit->getType();

    // Find the reduction identity variable. Zero for addition, or, xor,
    // one for multiplication, -1 for And.
    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
                                          VecTy->getScalarType());

    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                     RdxDesc.StartValue, Zero);

    // Fix the vector-loop phi.
    // We created the induction variable so we know that the
    // preheader is the first entry.
    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);

    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
    VecRdxPhi->addIncoming(Val, LoopVectorBody);

    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

    // This PHINode contains the vectorized reduction variable, or
    // the initial value vector, if we bypass the vector loop.
    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);

    // Extract the first scalar.
    Value *Scalar0 =
      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
    // Extract and reduce the remaining vector elements.
    for (unsigned i=1; i < VF; ++i) {
      Value *Scalar1 =
        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
      switch (RdxDesc.Kind) {
        case LoopVectorizationLegality::IntegerAdd:
          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
          break;
        case LoopVectorizationLegality::IntegerMult:
          Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
          break;
        case LoopVectorizationLegality::IntegerOr:
          Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
          break;
        case LoopVectorizationLegality::IntegerAnd:
          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
          break;
        case LoopVectorizationLegality::IntegerXor:
          Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
          break;
        default:
          llvm_unreachable("Unknown reduction operation");
      }
    }

    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
    // PHI nodes in the exit blocks.
    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
      if (!LCSSAPhi) continue;

      // All PHINodes need to have a single entry edge, or two if
      // we already fixed them.
      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

      // We found our reduction value exit-PHI. Update it with the
      // incoming bypass edge.
      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
        // Add an edge coming from the bypass.
        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
        break;
      }
    }// end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
  }// end of for each redux variable.
}

void SingleBlockLoopVectorizer::updateAnalysis() {
  // The original basic block.
  SE->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
         "Entry does not dominate exit.");

  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  DEBUG(DT->verifyAnalysis());
}

bool LoopVectorizationLegality::canVectorize() {
  if (!TheLoop->getLoopPreheader()) {
    assert(false && "No preheader!!");
    DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
    return false;
  }

  // We can only vectorize single basic block loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1) {
    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
    return false;
  }

  // We need to have a loop header.
  BasicBlock *BB = TheLoop->getHeader();
  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
  if (ExitCount == SE->getCouldNotCompute()) {
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Do not loop-vectorize loops with a tiny trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
  if (TC > 0u && TC < TinyTripCountThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
          "This loop is not worth vectorizing.\n");
    return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeBlock(*BB)) {
    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
    return false;
  }

  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
        <<"!\n");

  // Okay! We can vectorize. At this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return true;
}

bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {

  BasicBlock *PreHeader = TheLoop->getLoopPreheader();

  // Scan the instructions in the block and look for hazards.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    if (PHINode *Phi = dyn_cast<PHINode>(I)) {
      // This should not happen because the loop should be normalized.
      if (Phi->getNumIncomingValues() != 2) {
        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
        return false;
      }

      // This is the value coming from the preheader.
      Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);

      // We only look at integer and pointer phi nodes.
      if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
        DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
        Inductions[Phi] = StartValue;
        continue;
      } else if (!Phi->getType()->isIntegerTy()) {
        DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
        return false;
      }

      // Handle integer PHIs:
      // Only a single integer induction variable is supported; any
      // additional induction PHI rejects the loop.
      if (isInductionVariable(Phi)) {
        if (Induction) {
          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
          return false;
        }
        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
        Induction = Phi;
        Inductions[Phi] = StartValue;
        continue;
      }
      if (AddReductionVar(Phi, IntegerAdd)) {
        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerMult)) {
        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerOr)) {
        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerAnd)) {
        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerXor)) {
        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
        continue;
      }

      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
      return false;
    }// end of PHI handling

    // We still don't handle functions.
    CallInst *CI = dyn_cast<CallInst>(I);
    if (CI) {
      DEBUG(dbgs() << "LV: Found a call site.\n");
      return false;
    }

    // We do not re-vectorize vectors.
    if (!VectorType::isValidElementType(I->getType()) &&
        !I->getType()->isVoidTy()) {
      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
      return false;
    }

    // Reduction instructions are allowed to have exit users.
    // All other instructions must not have external users.
    if (!AllowedExit.count(I))
      //Check that all of the users of the loop are inside the BB.
      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
           it != e; ++it) {
        Instruction *U = cast<Instruction>(*it);
        // This user may be a reduction exit value.
        BasicBlock *Parent = U->getParent();
        if (Parent != &BB) {
          DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
          return false;
        }
      }
  } // next instr.

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    assert(getInductionVars()->size() && "No induction variables");
  }

  // Don't vectorize if the memory dependencies do not allow vectorization.
  if (!canVectorizeMemory(BB))
    return false;

  // We now know that the loop is vectorizable!
  // Collect variables that will remain uniform after vectorization.
  std::vector<Value*> Worklist;

  // Start with the conditional branch and walk up the block.
  Worklist.push_back(BB.getTerminator()->getOperand(0));

  while (Worklist.size()) {
    Instruction *I = dyn_cast<Instruction>(Worklist.back());
    Worklist.pop_back();

    // Look at instructions inside this block. Stop when reaching PHI nodes.
    if (!I || I->getParent() != &BB || isa<PHINode>(I))
      continue;

    // This is a known uniform.
    Uniforms.insert(I);

    // Insert all operands.
    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
      Worklist.push_back(I->getOperand(i));
    }
  }

  return true;
}

bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;
  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  // Scan the BB and collect legal loads and stores.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    // If this is a load, save it. If this instruction can read from memory
    // but is not a load, then we quit. Notice that we don't handle function
    // calls that read or write.
    if (I->mayReadFromMemory()) {
      LoadInst *Ld = dyn_cast<LoadInst>(I);
      if (!Ld) return false;
      if (!Ld->isSimple()) {
        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
        return false;
      }
      Loads.push_back(Ld);
      continue;
    }

    // Save store instructions. Abort if other instructions write to memory.
    if (I->mayWriteToMemory()) {
      StoreInst *St = dyn_cast<StoreInst>(I);
      if (!St) return false;
      if (!St->isSimple()) {
        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
        return false;
      }
      Stores.push_back(St);
    }
  } // next instr.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }

  // Holds the read and read-write *pointers* that we find.
  ValueVector Reads;
  ValueVector ReadWrites;

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = dyn_cast<StoreInst>(*I);
    assert(ST && "Bad StoreInst");
    Value* Ptr = ST->getPointerOperand();

    if (isUniform(Ptr)) {
      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
      return false;
    }

    // If we did *not* see this pointer before, insert it to
    // the read-write list. At this phase it is only a 'write' list.
    // (insert() reports whether the pointer was newly added.)
    if (Seen.insert(Ptr))
      ReadWrites.push_back(Ptr);
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = dyn_cast<LoadInst>(*I);
    assert(LD && "Bad LoadInst");
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
      Reads.push_back(Ptr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then is it safe to vectorize.
  if (ReadWrites.size() == 1 && Reads.size() == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool RT = true;
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.insert(SE, TheLoop, *I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.insert(SE, TheLoop, *I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }

  // Check that we did not collect too many pointers or found a
  // unsizeable pointer.
  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.reset();
    RT = false;
  }

  PtrRtCheck.Need = RT;

  if (RT) {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }

  // Now that the pointers are in two lists (Reads and ReadWrites), we
  // can check that there are no conflicts between each of the writes and
  // between the writes to the reads.
  // NOTE: from here on a failed static check returns RT, not false —
  // i.e. the loop is still vectorizable if a runtime check is available.
  ValueSet WriteObjects;
  ValueVector TempObjects;

  // Check that the read-writes do not conflict with other read-write
  // pointers.
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
        return RT;
      }
      if (!WriteObjects.insert(*it)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  /// Check that the reads don't conflict with the read-writes.
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
        return RT;
      }
      if (WriteObjects.count(*it)) {
        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  // It is safe to vectorize and we don't need any runtime checks.
  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
  PtrRtCheck.reset();
  return true;
}

bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Find the possible incoming reduction variable.
  BasicBlock *BB = Phi->getParent();
  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction, variables (which form a cycle
  // which ends in the phi node).
  Instruction *ExitInstruction = 0;

  // Iter is our iterator. We start with the PHI node and scan for all of the
  // users of this instruction. All users must be instructions which can be
  // used as reduction variables (such as ADD). We may have a single
  // out-of-block user. The cycle must end with the original PHI.
  // Also, we can't have multiple block-local users.
  Instruction *Iter = Phi;
  while (true) {
    // Any reduction instr must be of one of the allowed kinds.
1788 if (!isReductionInstr(Iter, Kind)) 1789 return false; 1790 1791 // Did we found a user inside this block ? 1792 bool FoundInBlockUser = false; 1793 // Did we reach the initial PHI node ? 1794 bool FoundStartPHI = false; 1795 1796 // If the instruction has no users then this is a broken 1797 // chain and can't be a reduction variable. 1798 if (Iter->use_empty()) 1799 return false; 1800 1801 // For each of the *users* of iter. 1802 for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); 1803 it != e; ++it) { 1804 Instruction *U = cast<Instruction>(*it); 1805 // We already know that the PHI is a user. 1806 if (U == Phi) { 1807 FoundStartPHI = true; 1808 continue; 1809 } 1810 // Check if we found the exit user. 1811 BasicBlock *Parent = U->getParent(); 1812 if (Parent != BB) { 1813 // We must have a single exit instruction. 1814 if (ExitInstruction != 0) 1815 return false; 1816 ExitInstruction = Iter; 1817 } 1818 // We can't have multiple inside users. 1819 if (FoundInBlockUser) 1820 return false; 1821 FoundInBlockUser = true; 1822 Iter = U; 1823 } 1824 1825 // We found a reduction var if we have reached the original 1826 // phi node and we only have a single instruction with out-of-loop 1827 // users. 1828 if (FoundStartPHI && ExitInstruction) { 1829 // This instruction is allowed to have out-of-loop users. 1830 AllowedExit.insert(ExitInstruction); 1831 1832 // Save the description of this reduction variable. 1833 ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); 1834 Reductions[Phi] = RD; 1835 return true; 1836 } 1837 } 1838} 1839 1840bool 1841LoopVectorizationLegality::isReductionInstr(Instruction *I, 1842 ReductionKind Kind) { 1843 switch (I->getOpcode()) { 1844 default: 1845 return false; 1846 case Instruction::PHI: 1847 // possibly. 
    return true;
  case Instruction::Add:
  case Instruction::Sub:
    // NOTE(review): Sub is accepted under the IntegerAdd kind — presumably
    // because x -= a folds into an add-reduction; confirm against the
    // widening code.
    return Kind == IntegerAdd;
  case Instruction::Mul:
    return Kind == IntegerMult;
  case Instruction::And:
    return Kind == IntegerAnd;
  case Instruction::Or:
    return Kind == IntegerOr;
  case Instruction::Xor:
    return Kind == IntegerXor;
  }
}

/// Returns true if \p Phi is an induction variable we can vectorize:
/// an integer PHI with stride one, or a pointer PHI whose stride equals
/// the allocation size of the pointee type (i.e. consecutive elements).
bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  Type *PhiTy = Phi->getType();
  // We only handle integer and pointer induction variables.
  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
    return false;

  // Check that the PHI is consecutive and starts at zero.
  const SCEV *PhiScev = SE->getSCEV(Phi);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
    return false;
  }
  const SCEV *Step = AR->getStepRecurrence(*SE);

  // Integer inductions need to have a stride of one.
  if (PhiTy->isIntegerTy())
    return Step->isOne();

  // Calculate the pointer stride and check if it is consecutive.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C) return false;

  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
  return (C->getValue()->equalsInt(Size));
}

/// Returns true if the SCEV of \p Ptr is an affine add-recurrence, i.e.
/// we can compute its start and end bounds for a runtime overlap check.
bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
  const SCEV *PhiScev = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR)
    return false;

  return AR->isAffine();
}

/// Selects the most profitable vectorization factor among the powers of
/// two up to \p VF by comparing per-element costs. Returns 1 (scalar) when
/// no target cost information is available or nothing beats the scalar cost.
unsigned
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
  if (!VTTI) {
    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
    return 1;
  }

  // Start with the scalar loop as the baseline.
  float Cost = expectedCost(1);
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
  // Only power-of-two widths are considered.
  for (unsigned i=2; i <= VF; i*=2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    float VectorCost = expectedCost(i) / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
  return Width;
}

/// Sums the estimated cost of every instruction in the (single-block) loop
/// when vectorized with factor \p VF.
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  // We can only estimate the cost of single basic block loops.
  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");

  BasicBlock *BB = TheLoop->getHeader();
  unsigned Cost = 0;

  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    Instruction *Inst = it;
    unsigned C = getInstructionCost(Inst, VF);
    Cost += C;
    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
          " For instruction: "<< *Inst << "\n");
  }

  return Cost;
}

/// Estimates the cost of a single instruction \p I when the loop is
/// vectorized with factor \p VF, using the target transform info (VTTI).
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  assert(VTTI && "Invalid vector target transformation info");

  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  Type *RetTy = I->getType();
  Type *VectorTy = ToVectorTy(RetTy, VF);


  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because scalar GEPs are usually
    // lowered to the instruction addressing mode. At the moment we don't
    // generate vector geps.
    return 0;
  case Instruction::Br: {
    return VTTI->getCFInstrCost(I->getOpcode());
  }
  case Instruction::PHI:
    // PHIs are assumed free; they become vector PHIs of the same count.
    return 0;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Arithmetic and logic ops map 1:1 to a wide op on the vectorized type.
    return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    // NOTE(review): when the condition is loop-invariant (scalar), its type
    // is still widened to a vector for the cost query — confirm this matches
    // how such selects are actually widened.
    if (ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Compares produce i1 results; cost is keyed on the operand type.
    Type *ValTy = I->getOperand(0)->getType();
    VectorTy = ToVectorTy(ValTy, VF);
    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(I);
    // Stores are costed on the type of the stored value, not the (void)
    // result type.
    Type *ValTy = SI->getValueOperand()->getType();
    VectorTy = ToVectorTy(ValTy, VF);

    if (VF == 1)
      return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
                                   SI->getAlignment(),
                                   SI->getPointerAddressSpace());

    // Scalarized stores: a non-consecutive pointer forces one extract plus
    // one scalar store per vector lane.
    if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
      unsigned Cost = 0;
      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                            ValTy);
      // The cost of extracting from the value vector.
      Cost += VF * (ExtCost);
      // The cost of the scalar stores.
      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                         ValTy->getScalarType(),
                                         SI->getAlignment(),
                                         SI->getPointerAddressSpace());
      return Cost;
    }

    // Wide stores.
    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
                                 SI->getPointerAddressSpace());
  }
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(I);

    if (VF == 1)
      return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
                                   LI->getAlignment(),
                                   LI->getPointerAddressSpace());

    // Scalarized loads: a non-consecutive pointer forces one scalar load
    // plus one insert per vector lane.
    if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
      unsigned Cost = 0;
      unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
      // The cost of inserting the loaded value into the result vector.
      Cost += VF * (InCost);
      // The cost of the scalar loads.
      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                         RetTy->getScalarType(),
                                         LI->getAlignment(),
                                         LI->getPointerAddressSpace());
      return Cost;
    }

    // Wide loads.
    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
                                 LI->getPointerAddressSpace());
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // Casts are costed between the widened source and destination types.
    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
    return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  }
  default: {
    // We are scalarizing the instruction. Return the cost of the scalar
    // instruction, plus the cost of insert and extract into vector
    // elements, times the vector width.
    unsigned Cost = 0;

    // Void results need no insert back into a result vector.
    bool IsVoid = RetTy->isVoidTy();

    unsigned InsCost = (IsVoid ? 0 :
                        VTTI->getInstrCost(Instruction::InsertElement,
                                           VectorTy));

    unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                          VectorTy);

    // The cost of inserting the results plus extracting each one of the
    // operands.
    Cost += VF * (InsCost + ExtCost * I->getNumOperands());

    // The cost of executing VF copies of the scalar instruction.
    Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
    return Cost;
  }
  }// end of switch.
2100} 2101 2102Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 2103 if (Scalar->isVoidTy() || VF == 1) 2104 return Scalar; 2105 return VectorType::get(Scalar, VF); 2106} 2107 2108} // namespace 2109 2110char LoopVectorize::ID = 0; 2111static const char lv_name[] = "Loop Vectorization"; 2112INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 2113INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 2114INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 2115INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 2116INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 2117 2118namespace llvm { 2119 Pass *createLoopVectorizePass() { 2120 return new LoopVectorize(); 2121 } 2122} 2123 2124