LoopVectorize.cpp revision d7e0b7cdc59d04bec6cf5273cde3d2a16b13800e
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 11// and generates target-independent LLVM-IR. Legalization of the IR is done 12// in the codegen. However, the vectorizes uses (will use) the codegen 13// interfaces to generate IR that is likely to result in an optimal binary. 14// 15// The loop vectorizer combines consecutive loop iteration into a single 16// 'wide' iteration. After this transformation the index is incremented 17// by the SIMD vector width, and not by one. 18// 19// This pass has three parts: 20// 1. The main loop pass that drives the different parts. 21// 2. LoopVectorizationLegality - A unit that checks for the legality 22// of the vectorization. 23// 3. SingleBlockLoopVectorizer - A unit that performs the actual 24// widening of instructions. 25// 4. LoopVectorizationCostModel - A unit that checks for the profitability 26// of vectorization. It decides on the optimal vector width, which 27// can be one, if vectorization is not profitable. 28// 29//===----------------------------------------------------------------------===// 30// 31// The reduction-variable vectorization is based on the paper: 32// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 33// 34// Variable uniformity checks are inspired by: 35// Karrenberg, R. and Hack, S. Whole Function Vectorization. 36// 37// Other ideas/concepts are from: 38// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 39// 40// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 41// Vectorizing Compilers. 42// 43//===----------------------------------------------------------------------===// 44#define LV_NAME "loop-vectorize" 45#define DEBUG_TYPE LV_NAME 46#include "llvm/Constants.h" 47#include "llvm/DerivedTypes.h" 48#include "llvm/Instructions.h" 49#include "llvm/LLVMContext.h" 50#include "llvm/Pass.h" 51#include "llvm/Analysis/LoopPass.h" 52#include "llvm/Value.h" 53#include "llvm/Function.h" 54#include "llvm/Analysis/Verifier.h" 55#include "llvm/Module.h" 56#include "llvm/Type.h" 57#include "llvm/ADT/SmallVector.h" 58#include "llvm/ADT/StringExtras.h" 59#include "llvm/Analysis/AliasAnalysis.h" 60#include "llvm/Analysis/AliasSetTracker.h" 61#include "llvm/Analysis/ScalarEvolution.h" 62#include "llvm/Analysis/Dominators.h" 63#include "llvm/Analysis/ScalarEvolutionExpressions.h" 64#include "llvm/Analysis/ScalarEvolutionExpander.h" 65#include "llvm/Analysis/LoopInfo.h" 66#include "llvm/Analysis/ValueTracking.h" 67#include "llvm/Transforms/Scalar.h" 68#include "llvm/Transforms/Utils/BasicBlockUtils.h" 69#include "llvm/TargetTransformInfo.h" 70#include "llvm/Support/CommandLine.h" 71#include "llvm/Support/Debug.h" 72#include "llvm/Support/raw_ostream.h" 73#include "llvm/DataLayout.h" 74#include "llvm/Transforms/Utils/Local.h" 75#include <algorithm> 76using namespace llvm; 77 78static cl::opt<unsigned> 79VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, 80 cl::desc("Set the default vectorization width. Zero is autoselect.")); 81 82/// We don't vectorize loops with a known constant trip count below this number. 83const unsigned TinyTripCountThreshold = 16; 84 85/// When performing a runtime memory check, do not check more than this 86/// number of pointers. Notice that the check is quadratic! 87const unsigned RuntimeMemoryCheckThreshold = 2; 88 89namespace { 90 91// Forward declarations. 92class LoopVectorizationLegality; 93class LoopVectorizationCostModel; 94 95/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic 96/// block to a specified vectorization factor (VF). 97/// This class performs the widening of scalars into vectors, or multiple 98/// scalars. This class also implements the following features: 99/// * It inserts an epilogue loop for handling loops that don't have iteration 100/// counts that are known to be a multiple of the vectorization factor. 101/// * It handles the code generation for reduction variables. 102/// * Scalarization (implementation using scalars) of un-vectorizable 103/// instructions. 104/// SingleBlockLoopVectorizer does not perform any vectorization-legality 105/// checks, and relies on the caller to check for the different legality 106/// aspects. The SingleBlockLoopVectorizer relies on the 107/// LoopVectorizationLegality class to provide information about the induction 108/// and reduction variables that were found to a given vectorization factor. 109class SingleBlockLoopVectorizer { 110public: 111 /// Ctor. 112 SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, 113 DominatorTree *dt, DataLayout *dl, 114 LPPassManager *Lpm, 115 unsigned VecWidth): 116 OrigLoop(Orig), SE(Se), LI(Li), DT(dt), DL(dl), LPM(Lpm), VF(VecWidth), 117 Builder(Se->getContext()), Induction(0), OldInduction(0) { } 118 119 // Perform the actual loop widening (vectorization). 120 void vectorize(LoopVectorizationLegality *Legal) { 121 ///Create a new empty loop. Unlink the old loop and connect the new one. 122 createEmptyLoop(Legal); 123 /// Widen each instruction in the old loop to a new one in the new loop. 124 /// Use the Legality module to find the induction and reduction variables. 125 vectorizeLoop(Legal); 126 // Register the new loop and update the analysis passes. 127 updateAnalysis(); 128 } 129 130private: 131 /// Create an empty loop, based on the loop ranges of the old loop. 132 void createEmptyLoop(LoopVectorizationLegality *Legal); 133 /// Copy and widen the instructions from the old loop. 134 void vectorizeLoop(LoopVectorizationLegality *Legal); 135 /// Insert the new loop to the loop hierarchy and pass manager 136 /// and update the analysis passes. 137 void updateAnalysis(); 138 139 /// This instruction is un-vectorizable. Implement it as a sequence 140 /// of scalars. 141 void scalarizeInstruction(Instruction *Instr); 142 143 /// Create a broadcast instruction. This method generates a broadcast 144 /// instruction (shuffle) for loop invariant values and for the induction 145 /// value. If this is the induction variable then we extend it to N, N+1, ... 146 /// this is needed because each iteration in the loop corresponds to a SIMD 147 /// element. 148 Value *getBroadcastInstrs(Value *V); 149 150 /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. 151 /// for each element in the vector. Starting from zero. 152 Value *getConsecutiveVector(Value* Val); 153 154 /// When we go over instructions in the basic block we rely on previous 155 /// values within the current basic block or on loop invariant values. 156 /// When we widen (vectorize) values we place them in the map. If the values 157 /// are not within the map, they have to be loop invariant, so we simply 158 /// broadcast them into a vector. 159 Value *getVectorValue(Value *V); 160 161 /// Get a uniform vector of constant integers. We use this to get 162 /// vectors of ones and zeros for the reduction code. 163 Constant* getUniformVector(unsigned Val, Type* ScalarTy); 164 165 typedef DenseMap<Value*, Value*> ValueMap; 166 167 /// The original loop. 168 Loop *OrigLoop; 169 // Scev analysis to use. 170 ScalarEvolution *SE; 171 // Loop Info. 172 LoopInfo *LI; 173 // Dominator Tree. 174 DominatorTree *DT; 175 // Data Layout; 176 DataLayout *DL; 177 // Loop Pass Manager; 178 LPPassManager *LPM; 179 // The vectorization factor to use. 180 unsigned VF; 181 182 // The builder that we use 183 IRBuilder<> Builder; 184 185 // --- Vectorization state --- 186 187 /// The vector-loop preheader. 188 BasicBlock *LoopVectorPreHeader; 189 /// The scalar-loop preheader. 190 BasicBlock *LoopScalarPreHeader; 191 /// Middle Block between the vector and the scalar. 192 BasicBlock *LoopMiddleBlock; 193 ///The ExitBlock of the scalar loop. 194 BasicBlock *LoopExitBlock; 195 ///The vector loop body. 196 BasicBlock *LoopVectorBody; 197 ///The scalar loop body. 198 BasicBlock *LoopScalarBody; 199 ///The first bypass block. 200 BasicBlock *LoopBypassBlock; 201 202 /// The new Induction variable which was added to the new block. 203 PHINode *Induction; 204 /// The induction variable of the old basic block. 205 PHINode *OldInduction; 206 // Maps scalars to widened vectors. 207 ValueMap WidenMap; 208}; 209 210/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and 211/// to what vectorization factor. 212/// This class does not look at the profitability of vectorization, only the 213/// legality. This class has two main kinds of checks: 214/// * Memory checks - The code in canVectorizeMemory checks if vectorization 215/// will change the order of memory accesses in a way that will change the 216/// correctness of the program. 217/// * Scalars checks - The code in canVectorizeBlock checks for a number 218/// of different conditions, such as the availability of a single induction 219/// variable, that all types are supported and vectorize-able, etc. 220/// This code reflects the capabilities of SingleBlockLoopVectorizer. 221/// This class is also used by SingleBlockLoopVectorizer for identifying 222/// induction variable and the different reduction variables. 223class LoopVectorizationLegality { 224public: 225 LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): 226 TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } 227 228 /// This represents the kinds of reductions that we support. 229 enum ReductionKind { 230 NoReduction, /// Not a reduction. 231 IntegerAdd, /// Sum of numbers. 232 IntegerMult, /// Product of numbers. 233 IntegerOr, /// Bitwise or logical OR of numbers. 234 IntegerAnd, /// Bitwise or logical AND of numbers. 235 IntegerXor /// Bitwise or logical XOR of numbers. 236 }; 237 238 /// This POD struct holds information about reduction variables. 239 struct ReductionDescriptor { 240 // Default C'tor 241 ReductionDescriptor(): 242 StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} 243 244 // C'tor. 245 ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): 246 StartValue(Start), LoopExitInstr(Exit), Kind(K) {} 247 248 // The starting value of the reduction. 249 // It does not have to be zero! 250 Value *StartValue; 251 // The instruction who's value is used outside the loop. 252 Instruction *LoopExitInstr; 253 // The kind of the reduction. 254 ReductionKind Kind; 255 }; 256 257 // This POD struct holds information about the memory runtime legality 258 // check that a group of pointers do not overlap. 259 struct RuntimePointerCheck { 260 RuntimePointerCheck(): Need(false) {} 261 262 /// Reset the state of the pointer runtime information. 263 void reset() { 264 Need = false; 265 Pointers.clear(); 266 Starts.clear(); 267 Ends.clear(); 268 } 269 270 /// Insert a pointer and calculate the start and end SCEVs. 271 void insert_pointer(ScalarEvolution *SE, Loop *Lp, Value *Ptr) { 272 const SCEV *Sc = SE->getSCEV(Ptr); 273 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); 274 assert(AR && "Invalid addrec expression"); 275 const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader()); 276 const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); 277 Pointers.push_back(Ptr); 278 Starts.push_back(AR->getStart()); 279 Ends.push_back(ScEnd); 280 } 281 282 /// This flag indicates if we need to add the runtime check. 283 bool Need; 284 /// Holds the pointers that we need to check. 285 SmallVector<Value*, 2> Pointers; 286 /// Holds the pointer value at the beginning of the loop. 287 SmallVector<const SCEV*, 2> Starts; 288 /// Holds the pointer value at the end of the loop. 289 SmallVector<const SCEV*, 2> Ends; 290 }; 291 292 /// ReductionList contains the reduction descriptors for all 293 /// of the reductions that were found in the loop. 294 typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; 295 296 /// InductionList saves induction variables and maps them to the initial 297 /// value entring the loop. 298 typedef DenseMap<PHINode*, Value*> InductionList; 299 300 /// Returns true if it is legal to vectorize this loop. 301 /// This does not mean that it is profitable to vectorize this 302 /// loop, only that it is legal to do so. 303 bool canVectorize(); 304 305 /// Returns the Induction variable. 306 PHINode *getInduction() {return Induction;} 307 308 /// Returns the reduction variables found in the loop. 309 ReductionList *getReductionVars() { return &Reductions; } 310 311 /// Returns the induction variables found in the loop. 312 InductionList *getInductionVars() { return &Inductions; } 313 314 /// Check if this pointer is consecutive when vectorizing. This happens 315 /// when the last index of the GEP is the induction variable, or that the 316 /// pointer itself is an induction variable. 317 /// This check allows us to vectorize A[idx] into a wide load/store. 318 bool isConsecutivePtr(Value *Ptr); 319 320 /// Returns true if the value V is uniform within the loop. 321 bool isUniform(Value *V); 322 323 /// Returns true if this instruction will remain scalar after vectorization. 324 bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} 325 326 /// Returns the information that we collected about runtime memory check. 327 RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; } 328private: 329 /// Check if a single basic block loop is vectorizable. 330 /// At this point we know that this is a loop with a constant trip count 331 /// and we only need to check individual instructions. 332 bool canVectorizeBlock(BasicBlock &BB); 333 334 /// When we vectorize loops we may change the order in which 335 /// we read and write from memory. This method checks if it is 336 /// legal to vectorize the code, considering only memory constrains. 337 /// Returns true if BB is vectorizable 338 bool canVectorizeMemory(BasicBlock &BB); 339 340 /// Returns True, if 'Phi' is the kind of reduction variable for type 341 /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. 342 bool AddReductionVar(PHINode *Phi, ReductionKind Kind); 343 /// Returns true if the instruction I can be a reduction variable of type 344 /// 'Kind'. 345 bool isReductionInstr(Instruction *I, ReductionKind Kind); 346 /// Returns True, if 'Phi' is an induction variable. 347 bool isInductionVariable(PHINode *Phi); 348 /// Return true if can compute the address bounds of Ptr within the loop. 349 bool hasComputableBounds(Value *Ptr); 350 351 /// The loop that we evaluate. 352 Loop *TheLoop; 353 /// Scev analysis. 354 ScalarEvolution *SE; 355 /// DataLayout analysis. 356 DataLayout *DL; 357 358 // --- vectorization state --- // 359 360 /// Holds the integer induction variable. This is the counter of the 361 /// loop. 362 PHINode *Induction; 363 /// Holds the reduction variables. 364 ReductionList Reductions; 365 /// Holds all of the induction variables that we found in the loop. 366 /// Notice that inductions don't need to start at zero and that induction 367 /// variables can be pointers. 368 InductionList Inductions; 369 370 /// Allowed outside users. This holds the reduction 371 /// vars which can be accessed from outside the loop. 372 SmallPtrSet<Value*, 4> AllowedExit; 373 /// This set holds the variables which are known to be uniform after 374 /// vectorization. 375 SmallPtrSet<Instruction*, 4> Uniforms; 376 /// We need to check that all of the pointers in this list are disjoint 377 /// at runtime. 378 RuntimePointerCheck PtrRtCheck; 379}; 380 381/// LoopVectorizationCostModel - estimates the expected speedups due to 382/// vectorization. 383/// In many cases vectorization is not profitable. This can happen because 384/// of a number of reasons. In this class we mainly attempt to predict 385/// the expected speedup/slowdowns due to the supported instruction set. 386/// We use the VectorTargetTransformInfo to query the different backends 387/// for the cost of different operations. 388class LoopVectorizationCostModel { 389public: 390 /// C'tor. 391 LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, 392 LoopVectorizationLegality *Leg, 393 const VectorTargetTransformInfo *Vtti): 394 TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } 395 396 /// Returns the most profitable vectorization factor for the loop that is 397 /// smaller or equal to the VF argument. This method checks every power 398 /// of two up to VF. 399 unsigned findBestVectorizationFactor(unsigned VF = 8); 400 401private: 402 /// Returns the expected execution cost. The unit of the cost does 403 /// not matter because we use the 'cost' units to compare different 404 /// vector widths. The cost that is returned is *not* normalized by 405 /// the factor width. 406 unsigned expectedCost(unsigned VF); 407 408 /// Returns the execution time cost of an instruction for a given vector 409 /// width. Vector width of one means scalar. 410 unsigned getInstructionCost(Instruction *I, unsigned VF); 411 412 /// A helper function for converting Scalar types to vector types. 413 /// If the incoming type is void, we return void. If the VF is 1, we return 414 /// the scalar type. 415 static Type* ToVectorTy(Type *Scalar, unsigned VF); 416 417 /// The loop that we evaluate. 418 Loop *TheLoop; 419 /// Scev analysis. 420 ScalarEvolution *SE; 421 422 /// Vectorization legality. 423 LoopVectorizationLegality *Legal; 424 /// Vector target information. 425 const VectorTargetTransformInfo *VTTI; 426}; 427 428struct LoopVectorize : public LoopPass { 429 static char ID; // Pass identification, replacement for typeid 430 431 LoopVectorize() : LoopPass(ID) { 432 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 433 } 434 435 ScalarEvolution *SE; 436 DataLayout *DL; 437 LoopInfo *LI; 438 TargetTransformInfo *TTI; 439 DominatorTree *DT; 440 441 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { 442 // We only vectorize innermost loops. 443 if (!L->empty()) 444 return false; 445 446 SE = &getAnalysis<ScalarEvolution>(); 447 DL = getAnalysisIfAvailable<DataLayout>(); 448 LI = &getAnalysis<LoopInfo>(); 449 TTI = getAnalysisIfAvailable<TargetTransformInfo>(); 450 DT = &getAnalysis<DominatorTree>(); 451 452 DEBUG(dbgs() << "LV: Checking a loop in \"" << 453 L->getHeader()->getParent()->getName() << "\"\n"); 454 455 // Check if it is legal to vectorize the loop. 456 LoopVectorizationLegality LVL(L, SE, DL); 457 if (!LVL.canVectorize()) { 458 DEBUG(dbgs() << "LV: Not vectorizing.\n"); 459 return false; 460 } 461 462 // Select the preffered vectorization factor. 463 unsigned VF = 1; 464 if (VectorizationFactor == 0) { 465 const VectorTargetTransformInfo *VTTI = 0; 466 if (TTI) 467 VTTI = TTI->getVectorTargetTransformInfo(); 468 // Use the cost model. 469 LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); 470 VF = CM.findBestVectorizationFactor(); 471 472 if (VF == 1) { 473 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 474 return false; 475 } 476 477 } else { 478 // Use the user command flag. 479 VF = VectorizationFactor; 480 } 481 482 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< 483 L->getHeader()->getParent()->getParent()->getModuleIdentifier()<< 484 "\n"); 485 486 // If we decided that it is *legal* to vectorizer the loop then do it. 487 SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, &LPM, VF); 488 LB.vectorize(&LVL); 489 490 DEBUG(verifyFunction(*L->getHeader()->getParent())); 491 return true; 492 } 493 494 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 495 LoopPass::getAnalysisUsage(AU); 496 AU.addRequiredID(LoopSimplifyID); 497 AU.addRequiredID(LCSSAID); 498 AU.addRequired<LoopInfo>(); 499 AU.addRequired<ScalarEvolution>(); 500 AU.addRequired<DominatorTree>(); 501 AU.addPreserved<LoopInfo>(); 502 AU.addPreserved<DominatorTree>(); 503 } 504 505}; 506 507Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { 508 // Create the types. 509 LLVMContext &C = V->getContext(); 510 Type *VTy = VectorType::get(V->getType(), VF); 511 Type *I32 = IntegerType::getInt32Ty(C); 512 Constant *Zero = ConstantInt::get(I32, 0); 513 Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); 514 Value *UndefVal = UndefValue::get(VTy); 515 // Insert the value into a new vector. 516 Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); 517 // Broadcast the scalar into all locations in the vector. 518 Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, 519 "broadcast"); 520 // We are accessing the induction variable. Make sure to promote the 521 // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. 522 if (V == Induction) 523 return getConsecutiveVector(Shuf); 524 return Shuf; 525} 526 527Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { 528 assert(Val->getType()->isVectorTy() && "Must be a vector"); 529 assert(Val->getType()->getScalarType()->isIntegerTy() && 530 "Elem must be an integer"); 531 // Create the types. 532 Type *ITy = Val->getType()->getScalarType(); 533 VectorType *Ty = cast<VectorType>(Val->getType()); 534 unsigned VLen = Ty->getNumElements(); 535 SmallVector<Constant*, 8> Indices; 536 537 // Create a vector of consecutive numbers from zero to VF. 538 for (unsigned i = 0; i < VLen; ++i) 539 Indices.push_back(ConstantInt::get(ITy, i)); 540 541 // Add the consecutive indices to the vector value. 542 Constant *Cv = ConstantVector::get(Indices); 543 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 544 return Builder.CreateAdd(Val, Cv, "induction"); 545} 546 547bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { 548 assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); 549 550 // If this pointer is an induction variable, return it. 551 PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); 552 if (Phi && getInductionVars()->count(Phi)) 553 return true; 554 555 GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 556 if (!Gep) 557 return false; 558 559 unsigned NumOperands = Gep->getNumOperands(); 560 Value *LastIndex = Gep->getOperand(NumOperands - 1); 561 562 // Check that all of the gep indices are uniform except for the last. 563 for (unsigned i = 0; i < NumOperands - 1; ++i) 564 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 565 return false; 566 567 // We can emit wide load/stores only of the last index is the induction 568 // variable. 569 const SCEV *Last = SE->getSCEV(LastIndex); 570 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 571 const SCEV *Step = AR->getStepRecurrence(*SE); 572 573 // The memory is consecutive because the last index is consecutive 574 // and all other indices are loop invariant. 575 if (Step->isOne()) 576 return true; 577 } 578 579 return false; 580} 581 582bool LoopVectorizationLegality::isUniform(Value *V) { 583 return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); 584} 585 586Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { 587 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 588 // If we saved a vectorized copy of V, use it. 589 Value *&MapEntry = WidenMap[V]; 590 if (MapEntry) 591 return MapEntry; 592 593 // Broadcast V and save the value for future uses. 594 Value *B = getBroadcastInstrs(V); 595 MapEntry = B; 596 return B; 597} 598 599Constant* 600SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { 601 return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); 602} 603 604void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 605 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 606 // Holds vector parameters or scalars, in case of uniform vals. 607 SmallVector<Value*, 8> Params; 608 609 // Find all of the vectorized parameters. 610 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 611 Value *SrcOp = Instr->getOperand(op); 612 613 // If we are accessing the old induction variable, use the new one. 614 if (SrcOp == OldInduction) { 615 Params.push_back(getVectorValue(Induction)); 616 continue; 617 } 618 619 // Try using previously calculated values. 620 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 621 622 // If the src is an instruction that appeared earlier in the basic block 623 // then it should already be vectorized. 624 if (SrcInst && SrcInst->getParent() == Instr->getParent()) { 625 assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); 626 // The parameter is a vector value from earlier. 627 Params.push_back(WidenMap[SrcInst]); 628 } else { 629 // The parameter is a scalar from outside the loop. Maybe even a constant. 630 Params.push_back(SrcOp); 631 } 632 } 633 634 assert(Params.size() == Instr->getNumOperands() && 635 "Invalid number of operands"); 636 637 // Does this instruction return a value ? 638 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 639 Value *VecResults = 0; 640 641 // If we have a return value, create an empty vector. We place the scalarized 642 // instructions in this vector. 643 if (!IsVoidRetTy) 644 VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); 645 646 // For each scalar that we create: 647 for (unsigned i = 0; i < VF; ++i) { 648 Instruction *Cloned = Instr->clone(); 649 if (!IsVoidRetTy) 650 Cloned->setName(Instr->getName() + ".cloned"); 651 // Replace the operands of the cloned instrucions with extracted scalars. 652 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 653 Value *Op = Params[op]; 654 // Param is a vector. Need to extract the right lane. 655 if (Op->getType()->isVectorTy()) 656 Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); 657 Cloned->setOperand(op, Op); 658 } 659 660 // Place the cloned scalar in the new loop. 661 Builder.Insert(Cloned); 662 663 // If the original scalar returns a value we need to place it in a vector 664 // so that future users will be able to use it. 665 if (!IsVoidRetTy) 666 VecResults = Builder.CreateInsertElement(VecResults, Cloned, 667 Builder.getInt32(i)); 668 } 669 670 if (!IsVoidRetTy) 671 WidenMap[Instr] = VecResults; 672} 673 674void 675SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 676 /* 677 In this function we generate a new loop. The new loop will contain 678 the vectorized instructions while the old loop will continue to run the 679 scalar remainder. 680 681 [ ] <-- vector loop bypass. 682 / | 683 / v 684| [ ] <-- vector pre header. 685| | 686| v 687| [ ] \ 688| [ ]_| <-- vector loop. 689| | 690 \ v 691 >[ ] <--- middle-block. 692 / | 693 / v 694| [ ] <--- new preheader. 695| | 696| v 697| [ ] \ 698| [ ]_| <-- old scalar loop to handle remainder. 699 \ | 700 \ v 701 >[ ] <-- exit block. 702 ... 703 */ 704 705 // Some loops have a single integer induction variable, while other loops 706 // don't. One example is c++ iterators that often have multiple pointer 707 // induction variables. In the code below we also support a case where we 708 // don't have a single induction variable. 709 OldInduction = Legal->getInduction(); 710 Type *IdxTy = OldInduction ? OldInduction->getType() : 711 DL->getIntPtrType(SE->getContext()); 712 713 // Find the loop boundaries. 714 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); 715 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 716 717 // Get the total trip count from the count by adding 1. 718 ExitCount = SE->getAddExpr(ExitCount, 719 SE->getConstant(ExitCount->getType(), 1)); 720 721 // This is the original scalar-loop preheader. 722 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 723 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 724 assert(ExitBlock && "Must have an exit block"); 725 726 // The loop index does not have to start at Zero. Find the original start 727 // value from the induction PHI node. If we don't have an induction variable 728 // then we know that it starts at zero. 729 Value *StartIdx = OldInduction ? 730 OldInduction->getIncomingValueForBlock(BypassBlock): 731 ConstantInt::get(IdxTy, 0); 732 733 assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); 734 assert(BypassBlock && "Invalid loop structure"); 735 736 BasicBlock *VectorPH = 737 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 738 BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), 739 "vector.body"); 740 741 BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), 742 "middle.block"); 743 BasicBlock *ScalarPH = 744 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), 745 "scalar.preheader"); 746 // Find the induction variable. 747 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 748 749 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 750 // inside the loop. 751 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 752 753 // Generate the induction variable. 754 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 755 Constant *Step = ConstantInt::get(IdxTy, VF); 756 757 // Expand the trip count and place the new instructions in the preheader. 758 // Notice that the pre-header does not change, only the loop body. 759 SCEVExpander Exp(*SE, "induction"); 760 Instruction *Loc = BypassBlock->getTerminator(); 761 762 // Count holds the overall loop count (N). 763 Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), Loc); 764 765 // We may need to extend the index in case there is a type mismatch. 766 // We know that the count starts at zero and does not overflow. 767 if (Count->getType() != IdxTy) { 768 // The exit count can be of pointer type. Convert it to the correct 769 // integer type. 770 if (ExitCount->getType()->isPointerTy()) 771 Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); 772 else 773 Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); 774 } 775 776 // Add the start index to the loop count to get the new end index. 777 Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); 778 779 // Now we need to generate the expression for N - (N % VF), which is 780 // the part that the vectorized body will execute. 781 Constant *CIVF = ConstantInt::get(IdxTy, VF); 782 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); 783 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 784 Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, 785 "end.idx.rnd.down", Loc); 786 787 // Now, compare the new count to zero. If it is zero skip the vector loop and 788 // jump to the scalar loop. 789 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 790 IdxEndRoundDown, 791 StartIdx, 792 "cmp.zero", Loc); 793 794 LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = 795 Legal->getRuntimePointerCheck(); 796 Value *MemoryRuntimeCheck = 0; 797 if (PtrRtCheck->Need) { 798 unsigned NumPointers = PtrRtCheck->Pointers.size(); 799 SmallVector<Value* , 2> Starts; 800 SmallVector<Value* , 2> Ends; 801 802 // Use this type for pointer arithmetic. 803 Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType(); 804 805 for (unsigned i=0; i < NumPointers; ++i) { 806 Value *Ptr = PtrRtCheck->Pointers[i]; 807 const SCEV *Sc = SE->getSCEV(Ptr); 808 809 if (SE->isLoopInvariant(Sc, OrigLoop)) { 810 DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" << 811 *Ptr <<"\n"); 812 Starts.push_back(Ptr); 813 Ends.push_back(Ptr); 814 } else { 815 DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); 816 817 Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], 818 PtrArithTy, Loc); 819 Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); 820 Starts.push_back(Start); 821 Ends.push_back(End); 822 } 823 } 824 825 for (unsigned i = 0; i < NumPointers; ++i) { 826 for (unsigned j = i+1; j < NumPointers; ++j) { 827 Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 828 Starts[i], Ends[j], "bound0", Loc); 829 Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 830 Starts[j], Ends[i], "bound1", Loc); 831 Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, 832 "found.conflict", Loc); 833 if (MemoryRuntimeCheck) { 834 MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, 835 MemoryRuntimeCheck, 836 IsConflict, 837 "conflict.rdx", Loc); 838 } else { 839 MemoryRuntimeCheck = IsConflict; 840 } 841 } 842 } 843 }// end of need-runtime-check code. 844 845 // If we are using memory runtime checks, include them in. 846 if (MemoryRuntimeCheck) { 847 Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, 848 "CntOrMem", Loc); 849 } 850 851 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 852 // Remove the old terminator. 853 Loc->eraseFromParent(); 854 855 // We are going to resume the execution of the scalar loop. 856 // Go over all of the induction variables that we found and fix the 857 // PHIs that are left in the scalar version of the loop. 858 // The starting values of PHI nodes depend on the counter of the last 859 // iteration in the vectorized loop. 860 // If we come from a bypass edge then we need to start from the original start 861 // value. 862 863 // This variable saves the new starting index for the scalar loop. 864 PHINode *ResumeIndex = 0; 865 LoopVectorizationLegality::InductionList::iterator I, E; 866 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 867 for (I = List->begin(), E = List->end(); I != E; ++I) { 868 PHINode *OrigPhi = I->first; 869 PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", 870 MiddleBlock->getTerminator()); 871 Value *EndValue = 0; 872 if (OrigPhi->getType()->isIntegerTy()) { 873 // Handle the integer induction counter: 874 assert(OrigPhi == OldInduction && "Unknown integer PHI"); 875 // We know what the end value is. 876 EndValue = IdxEndRoundDown; 877 // We also know which PHI node holds it. 878 ResumeIndex = ResumeVal; 879 } else { 880 // For pointer induction variables, calculate the offset using 881 // the end index. 882 EndValue = GetElementPtrInst::Create(I->second, CountRoundDown, 883 "ptr.ind.end", 884 BypassBlock->getTerminator()); 885 } 886 887 // The new PHI merges the original incoming value, in case of a bypass, 888 // or the value at the end of the vectorized loop. 889 ResumeVal->addIncoming(I->second, BypassBlock); 890 ResumeVal->addIncoming(EndValue, VecBody); 891 892 // Fix the scalar body counter (PHI node). 893 unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); 894 OrigPhi->setIncomingValue(BlockIdx, ResumeVal); 895 } 896 897 // If we are generating a new induction variable then we also need to 898 // generate the code that calculates the exit value. This value is not 899 // simply the end of the counter because we may skip the vectorized body 900 // in case of a runtime check. 901 if (!OldInduction){ 902 assert(!ResumeIndex && "Unexpected resume value found"); 903 ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", 904 MiddleBlock->getTerminator()); 905 ResumeIndex->addIncoming(StartIdx, BypassBlock); 906 ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); 907 } 908 909 // Make sure that we found the index where scalar loop needs to continue. 910 assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && 911 "Invalid resume Index"); 912 913 // Add a check in the middle block to see if we have completed 914 // all of the iterations in the first vector loop. 915 // If (N - N%VF) == N, then we *don't* need to run the remainder. 916 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 917 ResumeIndex, "cmp.n", 918 MiddleBlock->getTerminator()); 919 920 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 921 // Remove the old terminator. 922 MiddleBlock->getTerminator()->eraseFromParent(); 923 924 // Create i+1 and fill the PHINode. 925 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 926 Induction->addIncoming(StartIdx, VectorPH); 927 Induction->addIncoming(NextIdx, VecBody); 928 // Create the compare. 929 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 930 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 931 932 // Now we have two terminators. Remove the old one from the block. 933 VecBody->getTerminator()->eraseFromParent(); 934 935 // Get ready to start creating new instructions into the vectorized body. 936 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 937 938 // Register the new loop. 939 Loop* Lp = new Loop(); 940 LPM->insertLoop(Lp, OrigLoop->getParentLoop()); 941 942 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 943 944 Loop *ParentLoop = OrigLoop->getParentLoop(); 945 if (ParentLoop) { 946 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 947 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 948 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 949 } 950 951 // Save the state. 952 LoopVectorPreHeader = VectorPH; 953 LoopScalarPreHeader = ScalarPH; 954 LoopMiddleBlock = MiddleBlock; 955 LoopExitBlock = ExitBlock; 956 LoopVectorBody = VecBody; 957 LoopScalarBody = OldBasicBlock; 958 LoopBypassBlock = BypassBlock; 959} 960 961/// This function returns the identity element (or neutral element) for 962/// the operation K. 963static unsigned 964getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { 965 switch (K) { 966 case LoopVectorizationLegality::IntegerXor: 967 case LoopVectorizationLegality::IntegerAdd: 968 case LoopVectorizationLegality::IntegerOr: 969 // Adding, Xoring, Oring zero to a number does not change it. 970 return 0; 971 case LoopVectorizationLegality::IntegerMult: 972 // Multiplying a number by 1 does not change it. 973 return 1; 974 case LoopVectorizationLegality::IntegerAnd: 975 // AND-ing a number with an all-1 value does not change it. 976 return -1; 977 default: 978 llvm_unreachable("Unknown reduction kind"); 979 } 980} 981 982void 983SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { 984 //===------------------------------------------------===// 985 // 986 // Notice: any optimization or new instruction that go 987 // into the code below should be also be implemented in 988 // the cost-model. 989 // 990 //===------------------------------------------------===// 991 typedef SmallVector<PHINode*, 4> PhiVector; 992 BasicBlock &BB = *OrigLoop->getHeader(); 993 Constant *Zero = ConstantInt::get( 994 IntegerType::getInt32Ty(BB.getContext()), 0); 995 996 // In order to support reduction variables we need to be able to vectorize 997 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 998 // steages. First, we create a new vector PHI node with no incoming edges. 999 // We use this value when we vectorize all of the instructions that use the 1000 // PHI. Next, after all of the instructions in the block are complete we 1001 // add the new incoming edges to the PHI. At this point all of the 1002 // instructions in the basic block are vectorized, so we can use them to 1003 // construct the PHI. 1004 PhiVector RdxPHIsToFix; 1005 1006 // For each instruction in the old loop. 1007 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 1008 Instruction *Inst = it; 1009 1010 switch (Inst->getOpcode()) { 1011 case Instruction::Br: 1012 // Nothing to do for PHIs and BR, since we already took care of the 1013 // loop control flow instructions. 1014 continue; 1015 case Instruction::PHI:{ 1016 PHINode* P = cast<PHINode>(Inst); 1017 // Handle reduction variables: 1018 if (Legal->getReductionVars()->count(P)) { 1019 // This is phase one of vectorizing PHIs. 1020 Type *VecTy = VectorType::get(Inst->getType(), VF); 1021 WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi", 1022 LoopVectorBody->getFirstInsertionPt()); 1023 RdxPHIsToFix.push_back(P); 1024 continue; 1025 } 1026 1027 // This PHINode must be an induction variable. 1028 // Make sure that we know about it. 1029 assert(Legal->getInductionVars()->count(P) && 1030 "Not an induction variable"); 1031 1032 if (P->getType()->isIntegerTy()) { 1033 assert(P == OldInduction && "Unexpected PHI"); 1034 WidenMap[Inst] = getBroadcastInstrs(Induction); 1035 continue; 1036 } 1037 1038 // Handle pointer inductions: 1039 assert(P->getType()->isPointerTy() && "Unexpected type."); 1040 Value *StartIdx = OldInduction ? 1041 Legal->getInductionVars()->lookup(OldInduction) : 1042 ConstantInt::get(Induction->getType(), 0); 1043 1044 // This is the pointer value coming into the loop. 1045 Value *StartPtr = Legal->getInductionVars()->lookup(P); 1046 1047 // This is the normalized GEP that starts counting at zero. 1048 Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, 1049 "normalized.idx"); 1050 1051 // This is the vector of results. Notice that we don't generate vector 1052 // geps because scalar geps result in better code. 1053 Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); 1054 for (unsigned int i = 0; i < VF; ++i) { 1055 Constant *Idx = ConstantInt::get(Induction->getType(), i); 1056 Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); 1057 Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep"); 1058 VecVal = Builder.CreateInsertElement(VecVal, SclrGep, 1059 Builder.getInt32(i), 1060 "insert.gep"); 1061 } 1062 1063 WidenMap[Inst] = VecVal; 1064 continue; 1065 } 1066 case Instruction::Add: 1067 case Instruction::FAdd: 1068 case Instruction::Sub: 1069 case Instruction::FSub: 1070 case Instruction::Mul: 1071 case Instruction::FMul: 1072 case Instruction::UDiv: 1073 case Instruction::SDiv: 1074 case Instruction::FDiv: 1075 case Instruction::URem: 1076 case Instruction::SRem: 1077 case Instruction::FRem: 1078 case Instruction::Shl: 1079 case Instruction::LShr: 1080 case Instruction::AShr: 1081 case Instruction::And: 1082 case Instruction::Or: 1083 case Instruction::Xor: { 1084 // Just widen binops. 1085 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); 1086 Value *A = getVectorValue(Inst->getOperand(0)); 1087 Value *B = getVectorValue(Inst->getOperand(1)); 1088 1089 // Use this vector value for all users of the original instruction. 1090 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); 1091 WidenMap[Inst] = V; 1092 1093 // Update the NSW, NUW and Exact flags. 1094 BinaryOperator *VecOp = cast<BinaryOperator>(V); 1095 if (isa<OverflowingBinaryOperator>(BinOp)) { 1096 VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); 1097 VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); 1098 } 1099 if (isa<PossiblyExactOperator>(VecOp)) 1100 VecOp->setIsExact(BinOp->isExact()); 1101 break; 1102 } 1103 case Instruction::Select: { 1104 // Widen selects. 1105 // If the selector is loop invariant we can create a select 1106 // instruction with a scalar condition. Otherwise, use vector-select. 1107 Value *Cond = Inst->getOperand(0); 1108 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); 1109 1110 // The condition can be loop invariant but still defined inside the 1111 // loop. This means that we can't just use the original 'cond' value. 1112 // We have to take the 'vectorized' value and pick the first lane. 1113 // Instcombine will make this a no-op. 1114 Cond = getVectorValue(Cond); 1115 if (InvariantCond) 1116 Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); 1117 1118 Value *Op0 = getVectorValue(Inst->getOperand(1)); 1119 Value *Op1 = getVectorValue(Inst->getOperand(2)); 1120 WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); 1121 break; 1122 } 1123 1124 case Instruction::ICmp: 1125 case Instruction::FCmp: { 1126 // Widen compares. Generate vector compares. 1127 bool FCmp = (Inst->getOpcode() == Instruction::FCmp); 1128 CmpInst *Cmp = dyn_cast<CmpInst>(Inst); 1129 Value *A = getVectorValue(Inst->getOperand(0)); 1130 Value *B = getVectorValue(Inst->getOperand(1)); 1131 if (FCmp) 1132 WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 1133 else 1134 WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); 1135 break; 1136 } 1137 1138 case Instruction::Store: { 1139 // Attempt to issue a wide store. 1140 StoreInst *SI = dyn_cast<StoreInst>(Inst); 1141 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); 1142 Value *Ptr = SI->getPointerOperand(); 1143 unsigned Alignment = SI->getAlignment(); 1144 1145 assert(!Legal->isUniform(Ptr) && 1146 "We do not allow storing to uniform addresses"); 1147 1148 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1149 1150 // This store does not use GEPs. 1151 if (!Legal->isConsecutivePtr(Ptr)) { 1152 scalarizeInstruction(Inst); 1153 break; 1154 } 1155 1156 if (Gep) { 1157 // The last index does not have to be the induction. It can be 1158 // consecutive and be a function of the index. For example A[I+1]; 1159 unsigned NumOperands = Gep->getNumOperands(); 1160 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); 1161 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1162 1163 // Create the new GEP with the new induction variable. 1164 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1165 Gep2->setOperand(NumOperands - 1, LastIndex); 1166 Ptr = Builder.Insert(Gep2); 1167 } else { 1168 // Use the induction element ptr. 1169 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1170 Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); 1171 } 1172 Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); 1173 Value *Val = getVectorValue(SI->getValueOperand()); 1174 Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); 1175 break; 1176 } 1177 case Instruction::Load: { 1178 // Attempt to issue a wide load. 1179 LoadInst *LI = dyn_cast<LoadInst>(Inst); 1180 Type *RetTy = VectorType::get(LI->getType(), VF); 1181 Value *Ptr = LI->getPointerOperand(); 1182 unsigned Alignment = LI->getAlignment(); 1183 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1184 1185 // If the pointer is loop invariant or if it is non consecutive, 1186 // scalarize the load. 1187 bool Con = Legal->isConsecutivePtr(Ptr); 1188 if (Legal->isUniform(Ptr) || !Con) { 1189 scalarizeInstruction(Inst); 1190 break; 1191 } 1192 1193 if (Gep) { 1194 // The last index does not have to be the induction. It can be 1195 // consecutive and be a function of the index. For example A[I+1]; 1196 unsigned NumOperands = Gep->getNumOperands(); 1197 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); 1198 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1199 1200 // Create the new GEP with the new induction variable. 1201 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1202 Gep2->setOperand(NumOperands - 1, LastIndex); 1203 Ptr = Builder.Insert(Gep2); 1204 } else { 1205 // Use the induction element ptr. 1206 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1207 Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); 1208 } 1209 1210 Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); 1211 LI = Builder.CreateLoad(Ptr); 1212 LI->setAlignment(Alignment); 1213 // Use this vector value for all users of the load. 1214 WidenMap[Inst] = LI; 1215 break; 1216 } 1217 case Instruction::ZExt: 1218 case Instruction::SExt: 1219 case Instruction::FPToUI: 1220 case Instruction::FPToSI: 1221 case Instruction::FPExt: 1222 case Instruction::PtrToInt: 1223 case Instruction::IntToPtr: 1224 case Instruction::SIToFP: 1225 case Instruction::UIToFP: 1226 case Instruction::Trunc: 1227 case Instruction::FPTrunc: 1228 case Instruction::BitCast: { 1229 /// Vectorize bitcasts. 1230 CastInst *CI = dyn_cast<CastInst>(Inst); 1231 Value *A = getVectorValue(Inst->getOperand(0)); 1232 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 1233 WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); 1234 break; 1235 } 1236 1237 default: 1238 /// All other instructions are unsupported. Scalarize them. 1239 scalarizeInstruction(Inst); 1240 break; 1241 }// end of switch. 1242 }// end of for_each instr. 1243 1244 // At this point every instruction in the original loop is widended to 1245 // a vector form. We are almost done. Now, we need to fix the PHI nodes 1246 // that we vectorized. The PHI nodes are currently empty because we did 1247 // not want to introduce cycles. Notice that the remaining PHI nodes 1248 // that we need to fix are reduction variables. 1249 1250 // Create the 'reduced' values for each of the induction vars. 1251 // The reduced values are the vector values that we scalarize and combine 1252 // after the loop is finished. 1253 for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); 1254 it != e; ++it) { 1255 PHINode *RdxPhi = *it; 1256 PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]); 1257 assert(RdxPhi && "Unable to recover vectorized PHI"); 1258 1259 // Find the reduction variable descriptor. 1260 assert(Legal->getReductionVars()->count(RdxPhi) && 1261 "Unable to find the reduction variable"); 1262 LoopVectorizationLegality::ReductionDescriptor RdxDesc = 1263 (*Legal->getReductionVars())[RdxPhi]; 1264 1265 // We need to generate a reduction vector from the incoming scalar. 1266 // To do so, we need to generate the 'identity' vector and overide 1267 // one of the elements with the incoming scalar reduction. We need 1268 // to do it in the vector-loop preheader. 1269 Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); 1270 1271 // This is the vector-clone of the value that leaves the loop. 1272 Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); 1273 Type *VecTy = VectorExit->getType(); 1274 1275 // Find the reduction identity variable. Zero for addition, or, xor, 1276 // one for multiplication, -1 for And. 1277 Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), 1278 VecTy->getScalarType()); 1279 1280 // This vector is the Identity vector where the first element is the 1281 // incoming scalar reduction. 1282 Value *VectorStart = Builder.CreateInsertElement(Identity, 1283 RdxDesc.StartValue, Zero); 1284 1285 // Fix the vector-loop phi. 1286 // We created the induction variable so we know that the 1287 // preheader is the first entry. 1288 BasicBlock *VecPreheader = Induction->getIncomingBlock(0); 1289 1290 // Reductions do not have to start at zero. They can start with 1291 // any loop invariant values. 1292 VecRdxPhi->addIncoming(VectorStart, VecPreheader); 1293 unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); 1294 Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); 1295 VecRdxPhi->addIncoming(Val, LoopVectorBody); 1296 1297 // Before each round, move the insertion point right between 1298 // the PHIs and the values we are going to write. 1299 // This allows us to write both PHINodes and the extractelement 1300 // instructions. 1301 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); 1302 1303 // This PHINode contains the vectorized reduction variable, or 1304 // the initial value vector, if we bypass the vector loop. 1305 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); 1306 NewPhi->addIncoming(VectorStart, LoopBypassBlock); 1307 NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); 1308 1309 // Extract the first scalar. 1310 Value *Scalar0 = 1311 Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); 1312 // Extract and reduce the remaining vector elements. 1313 for (unsigned i=1; i < VF; ++i) { 1314 Value *Scalar1 = 1315 Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); 1316 switch (RdxDesc.Kind) { 1317 case LoopVectorizationLegality::IntegerAdd: 1318 Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); 1319 break; 1320 case LoopVectorizationLegality::IntegerMult: 1321 Scalar0 = Builder.CreateMul(Scalar0, Scalar1); 1322 break; 1323 case LoopVectorizationLegality::IntegerOr: 1324 Scalar0 = Builder.CreateOr(Scalar0, Scalar1); 1325 break; 1326 case LoopVectorizationLegality::IntegerAnd: 1327 Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); 1328 break; 1329 case LoopVectorizationLegality::IntegerXor: 1330 Scalar0 = Builder.CreateXor(Scalar0, Scalar1); 1331 break; 1332 default: 1333 llvm_unreachable("Unknown reduction operation"); 1334 } 1335 } 1336 1337 // Now, we need to fix the users of the reduction variable 1338 // inside and outside of the scalar remainder loop. 1339 // We know that the loop is in LCSSA form. We need to update the 1340 // PHI nodes in the exit blocks. 1341 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 1342 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 1343 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 1344 if (!LCSSAPhi) continue; 1345 1346 // All PHINodes need to have a single entry edge, or two if 1347 // we already fixed them. 1348 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 1349 1350 // We found our reduction value exit-PHI. Update it with the 1351 // incoming bypass edge. 1352 if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { 1353 // Add an edge coming from the bypass. 1354 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); 1355 break; 1356 } 1357 }// end of the LCSSA phi scan. 1358 1359 // Fix the scalar loop reduction variable with the incoming reduction sum 1360 // from the vector body and from the backedge value. 1361 int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); 1362 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. 1363 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); 1364 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); 1365 }// end of for each redux variable. 1366} 1367 1368void SingleBlockLoopVectorizer::updateAnalysis() { 1369 // The original basic block. 1370 SE->forgetLoop(OrigLoop); 1371 1372 // Update the dominator tree information. 1373 assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && 1374 "Entry does not dominate exit."); 1375 1376 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); 1377 DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); 1378 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); 1379 DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); 1380 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 1381 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 1382 1383 DEBUG(DT->verifyAnalysis()); 1384} 1385 1386bool LoopVectorizationLegality::canVectorize() { 1387 if (!TheLoop->getLoopPreheader()) { 1388 assert(false && "No preheader!!"); 1389 DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); 1390 return false; 1391 } 1392 1393 // We can only vectorize single basic block loops. 1394 unsigned NumBlocks = TheLoop->getNumBlocks(); 1395 if (NumBlocks != 1) { 1396 DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); 1397 return false; 1398 } 1399 1400 // We need to have a loop header. 1401 BasicBlock *BB = TheLoop->getHeader(); 1402 DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n"); 1403 1404 // ScalarEvolution needs to be able to find the exit count. 1405 const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); 1406 if (ExitCount == SE->getCouldNotCompute()) { 1407 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 1408 return false; 1409 } 1410 1411 // Do not loop-vectorize loops with a tiny trip count. 1412 unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB); 1413 if (TC > 0u && TC < TinyTripCountThreshold) { 1414 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << 1415 "This loop is not worth vectorizing.\n"); 1416 return false; 1417 } 1418 1419 // Go over each instruction and look at memory deps. 1420 if (!canVectorizeBlock(*BB)) { 1421 DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); 1422 return false; 1423 } 1424 1425 DEBUG(dbgs() << "LV: We can vectorize this loop" << 1426 (PtrRtCheck.Need ? " (with a runtime bound check)" : "") 1427 <<"!\n"); 1428 1429 // Okay! We can vectorize. At this point we don't have any other mem analysis 1430 // which may limit our maximum vectorization factor, so just return true with 1431 // no restrictions. 1432 return true; 1433} 1434 1435bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { 1436 1437 BasicBlock *PreHeader = TheLoop->getLoopPreheader(); 1438 1439 // Scan the instructions in the block and look for hazards. 1440 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 1441 Instruction *I = it; 1442 1443 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 1444 // This should not happen because the loop should be normalized. 1445 if (Phi->getNumIncomingValues() != 2) { 1446 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 1447 return false; 1448 } 1449 1450 // This is the value coming from the preheader. 1451 Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); 1452 1453 // We only look at integer and pointer phi nodes. 1454 if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) { 1455 DEBUG(dbgs() << "LV: Found a pointer induction variable.\n"); 1456 Inductions[Phi] = StartValue; 1457 continue; 1458 } else if (!Phi->getType()->isIntegerTy()) { 1459 DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); 1460 return false; 1461 } 1462 1463 // Handle integer PHIs: 1464 if (isInductionVariable(Phi)) { 1465 if (Induction) { 1466 DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); 1467 return false; 1468 } 1469 DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); 1470 Induction = Phi; 1471 Inductions[Phi] = StartValue; 1472 continue; 1473 } 1474 if (AddReductionVar(Phi, IntegerAdd)) { 1475 DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); 1476 continue; 1477 } 1478 if (AddReductionVar(Phi, IntegerMult)) { 1479 DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); 1480 continue; 1481 } 1482 if (AddReductionVar(Phi, IntegerOr)) { 1483 DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); 1484 continue; 1485 } 1486 if (AddReductionVar(Phi, IntegerAnd)) { 1487 DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); 1488 continue; 1489 } 1490 if (AddReductionVar(Phi, IntegerXor)) { 1491 DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); 1492 continue; 1493 } 1494 1495 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 1496 return false; 1497 }// end of PHI handling 1498 1499 // We still don't handle functions. 1500 CallInst *CI = dyn_cast<CallInst>(I); 1501 if (CI) { 1502 DEBUG(dbgs() << "LV: Found a call site.\n"); 1503 return false; 1504 } 1505 1506 // We do not re-vectorize vectors. 1507 if (!VectorType::isValidElementType(I->getType()) && 1508 !I->getType()->isVoidTy()) { 1509 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); 1510 return false; 1511 } 1512 1513 // Reduction instructions are allowed to have exit users. 1514 // All other instructions must not have external users. 1515 if (!AllowedExit.count(I)) 1516 //Check that all of the users of the loop are inside the BB. 1517 for (Value::use_iterator it = I->use_begin(), e = I->use_end(); 1518 it != e; ++it) { 1519 Instruction *U = cast<Instruction>(*it); 1520 // This user may be a reduction exit value. 1521 BasicBlock *Parent = U->getParent(); 1522 if (Parent != &BB) { 1523 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); 1524 return false; 1525 } 1526 } 1527 } // next instr. 1528 1529 if (!Induction) { 1530 DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); 1531 assert(getInductionVars()->size() && "No induction variables"); 1532 } 1533 1534 // Don't vectorize if the memory dependencies do not allow vectorization. 1535 if (!canVectorizeMemory(BB)) 1536 return false; 1537 1538 // We now know that the loop is vectorizable! 1539 // Collect variables that will remain uniform after vectorization. 1540 std::vector<Value*> Worklist; 1541 1542 // Start with the conditional branch and walk up the block. 1543 Worklist.push_back(BB.getTerminator()->getOperand(0)); 1544 1545 while (Worklist.size()) { 1546 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 1547 Worklist.pop_back(); 1548 1549 // Look at instructions inside this block. Stop when reaching PHI nodes. 1550 if (!I || I->getParent() != &BB || isa<PHINode>(I)) 1551 continue; 1552 1553 // This is a known uniform. 1554 Uniforms.insert(I); 1555 1556 // Insert all operands. 1557 for (int i=0, Op = I->getNumOperands(); i < Op; ++i) { 1558 Worklist.push_back(I->getOperand(i)); 1559 } 1560 } 1561 1562 return true; 1563} 1564 1565bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { 1566 typedef SmallVector<Value*, 16> ValueVector; 1567 typedef SmallPtrSet<Value*, 16> ValueSet; 1568 // Holds the Load and Store *instructions*. 1569 ValueVector Loads; 1570 ValueVector Stores; 1571 PtrRtCheck.Pointers.clear(); 1572 PtrRtCheck.Need = false; 1573 1574 // Scan the BB and collect legal loads and stores. 1575 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 1576 Instruction *I = it; 1577 1578 // If this is a load, save it. If this instruction can read from memory 1579 // but is not a load, then we quit. Notice that we don't handle function 1580 // calls that read or write. 1581 if (I->mayReadFromMemory()) { 1582 LoadInst *Ld = dyn_cast<LoadInst>(I); 1583 if (!Ld) return false; 1584 if (!Ld->isSimple()) { 1585 DEBUG(dbgs() << "LV: Found a non-simple load.\n"); 1586 return false; 1587 } 1588 Loads.push_back(Ld); 1589 continue; 1590 } 1591 1592 // Save store instructions. Abort if other instructions write to memory. 1593 if (I->mayWriteToMemory()) { 1594 StoreInst *St = dyn_cast<StoreInst>(I); 1595 if (!St) return false; 1596 if (!St->isSimple()) { 1597 DEBUG(dbgs() << "LV: Found a non-simple store.\n"); 1598 return false; 1599 } 1600 Stores.push_back(St); 1601 } 1602 } // next instr. 1603 1604 // Now we have two lists that hold the loads and the stores. 1605 // Next, we find the pointers that they use. 1606 1607 // Check if we see any stores. If there are no stores, then we don't 1608 // care if the pointers are *restrict*. 1609 if (!Stores.size()) { 1610 DEBUG(dbgs() << "LV: Found a read-only loop!\n"); 1611 return true; 1612 } 1613 1614 // Holds the read and read-write *pointers* that we find. 1615 ValueVector Reads; 1616 ValueVector ReadWrites; 1617 1618 // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects 1619 // multiple times on the same object. If the ptr is accessed twice, once 1620 // for read and once for write, it will only appear once (on the write 1621 // list). This is okay, since we are going to check for conflicts between 1622 // writes and between reads and writes, but not between reads and reads. 1623 ValueSet Seen; 1624 1625 ValueVector::iterator I, IE; 1626 for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { 1627 StoreInst *ST = dyn_cast<StoreInst>(*I); 1628 assert(ST && "Bad StoreInst"); 1629 Value* Ptr = ST->getPointerOperand(); 1630 1631 if (isUniform(Ptr)) { 1632 DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); 1633 return false; 1634 } 1635 1636 // If we did *not* see this pointer before, insert it to 1637 // the read-write list. At this phase it is only a 'write' list. 1638 if (Seen.insert(Ptr)) 1639 ReadWrites.push_back(Ptr); 1640 } 1641 1642 for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { 1643 LoadInst *LD = dyn_cast<LoadInst>(*I); 1644 assert(LD && "Bad LoadInst"); 1645 Value* Ptr = LD->getPointerOperand(); 1646 // If we did *not* see this pointer before, insert it to the 1647 // read list. If we *did* see it before, then it is already in 1648 // the read-write list. This allows us to vectorize expressions 1649 // such as A[i] += x; Because the address of A[i] is a read-write 1650 // pointer. This only works if the index of A[i] is consecutive. 1651 // If the address of i is unknown (for example A[B[i]]) then we may 1652 // read a few words, modify, and write a few words, and some of the 1653 // words may be written to the same address. 1654 if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr)) 1655 Reads.push_back(Ptr); 1656 } 1657 1658 // If we write (or read-write) to a single destination and there are no 1659 // other reads in this loop then is it safe to vectorize. 1660 if (ReadWrites.size() == 1 && Reads.size() == 0) { 1661 DEBUG(dbgs() << "LV: Found a write-only loop!\n"); 1662 return true; 1663 } 1664 1665 // Find pointers with computable bounds. We are going to use this information 1666 // to place a runtime bound check. 1667 bool RT = true; 1668 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) 1669 if (hasComputableBounds(*I)) { 1670 PtrRtCheck.insert_pointer(SE, TheLoop, *I); 1671 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 1672 } else { 1673 RT = false; 1674 break; 1675 } 1676 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) 1677 if (hasComputableBounds(*I)) { 1678 PtrRtCheck.insert_pointer(SE, TheLoop, *I); 1679 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 1680 } else { 1681 RT = false; 1682 break; 1683 } 1684 1685 // Check that we did not collect too many pointers or found a 1686 // unsizeable pointer. 1687 if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { 1688 PtrRtCheck.reset(); 1689 RT = false; 1690 } 1691 1692 PtrRtCheck.Need = RT; 1693 1694 if (RT) { 1695 DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); 1696 } 1697 1698 // Now that the pointers are in two lists (Reads and ReadWrites), we 1699 // can check that there are no conflicts between each of the writes and 1700 // between the writes to the reads. 1701 ValueSet WriteObjects; 1702 ValueVector TempObjects; 1703 1704 // Check that the read-writes do not conflict with other read-write 1705 // pointers. 1706 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { 1707 GetUnderlyingObjects(*I, TempObjects, DL); 1708 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1709 it != e; ++it) { 1710 if (!isIdentifiedObject(*it)) { 1711 DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); 1712 return RT; 1713 } 1714 if (!WriteObjects.insert(*it)) { 1715 DEBUG(dbgs() << "LV: Found a possible write-write reorder:" 1716 << **it <<"\n"); 1717 return RT; 1718 } 1719 } 1720 TempObjects.clear(); 1721 } 1722 1723 /// Check that the reads don't conflict with the read-writes. 1724 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { 1725 GetUnderlyingObjects(*I, TempObjects, DL); 1726 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1727 it != e; ++it) { 1728 if (!isIdentifiedObject(*it)) { 1729 DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); 1730 return RT; 1731 } 1732 if (WriteObjects.count(*it)) { 1733 DEBUG(dbgs() << "LV: Found a possible read/write reorder:" 1734 << **it <<"\n"); 1735 return RT; 1736 } 1737 } 1738 TempObjects.clear(); 1739 } 1740 1741 // It is safe to vectorize and we don't need any runtime checks. 1742 DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n"); 1743 PtrRtCheck.reset(); 1744 return true; 1745} 1746 1747bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, 1748 ReductionKind Kind) { 1749 if (Phi->getNumIncomingValues() != 2) 1750 return false; 1751 1752 // Find the possible incoming reduction variable. 1753 BasicBlock *BB = Phi->getParent(); 1754 int SelfEdgeIdx = Phi->getBasicBlockIndex(BB); 1755 int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry. 1756 Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx); 1757 1758 // ExitInstruction is the single value which is used outside the loop. 1759 // We only allow for a single reduction value to be used outside the loop. 1760 // This includes users of the reduction, variables (which form a cycle 1761 // which ends in the phi node). 1762 Instruction *ExitInstruction = 0; 1763 1764 // Iter is our iterator. We start with the PHI node and scan for all of the 1765 // users of this instruction. All users must be instructions which can be 1766 // used as reduction variables (such as ADD). We may have a single 1767 // out-of-block user. They cycle must end with the original PHI. 1768 // Also, we can't have multiple block-local users. 1769 Instruction *Iter = Phi; 1770 while (true) { 1771 // Any reduction instr must be of one of the allowed kinds. 1772 if (!isReductionInstr(Iter, Kind)) 1773 return false; 1774 1775 // Did we found a user inside this block ? 1776 bool FoundInBlockUser = false; 1777 // Did we reach the initial PHI node ? 1778 bool FoundStartPHI = false; 1779 1780 // If the instruction has no users then this is a broken 1781 // chain and can't be a reduction variable. 1782 if (Iter->use_empty()) 1783 return false; 1784 1785 // For each of the *users* of iter. 1786 for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); 1787 it != e; ++it) { 1788 Instruction *U = cast<Instruction>(*it); 1789 // We already know that the PHI is a user. 1790 if (U == Phi) { 1791 FoundStartPHI = true; 1792 continue; 1793 } 1794 // Check if we found the exit user. 1795 BasicBlock *Parent = U->getParent(); 1796 if (Parent != BB) { 1797 // We must have a single exit instruction. 1798 if (ExitInstruction != 0) 1799 return false; 1800 ExitInstruction = Iter; 1801 } 1802 // We can't have multiple inside users. 1803 if (FoundInBlockUser) 1804 return false; 1805 FoundInBlockUser = true; 1806 Iter = U; 1807 } 1808 1809 // We found a reduction var if we have reached the original 1810 // phi node and we only have a single instruction with out-of-loop 1811 // users. 1812 if (FoundStartPHI && ExitInstruction) { 1813 // This instruction is allowed to have out-of-loop users. 1814 AllowedExit.insert(ExitInstruction); 1815 1816 // Save the description of this reduction variable. 1817 ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); 1818 Reductions[Phi] = RD; 1819 return true; 1820 } 1821 } 1822} 1823 1824bool 1825LoopVectorizationLegality::isReductionInstr(Instruction *I, 1826 ReductionKind Kind) { 1827 switch (I->getOpcode()) { 1828 default: 1829 return false; 1830 case Instruction::PHI: 1831 // possibly. 1832 return true; 1833 case Instruction::Add: 1834 case Instruction::Sub: 1835 return Kind == IntegerAdd; 1836 case Instruction::Mul: 1837 return Kind == IntegerMult; 1838 case Instruction::And: 1839 return Kind == IntegerAnd; 1840 case Instruction::Or: 1841 return Kind == IntegerOr; 1842 case Instruction::Xor: 1843 return Kind == IntegerXor; 1844 } 1845} 1846 1847bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { 1848 Type *PhiTy = Phi->getType(); 1849 // We only handle integer and pointer inductions variables. 1850 if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) 1851 return false; 1852 1853 // Check that the PHI is consecutive and starts at zero. 1854 const SCEV *PhiScev = SE->getSCEV(Phi); 1855 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 1856 if (!AR) { 1857 DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); 1858 return false; 1859 } 1860 const SCEV *Step = AR->getStepRecurrence(*SE); 1861 1862 // Integer inductions need to have a stride of one. 1863 if (PhiTy->isIntegerTy()) 1864 return Step->isOne(); 1865 1866 // Calculate the pointer stride and check if it is consecutive. 1867 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); 1868 if (!C) return false; 1869 1870 assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); 1871 uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); 1872 return (C->getValue()->equalsInt(Size)); 1873} 1874 1875bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { 1876 const SCEV *PhiScev = SE->getSCEV(Ptr); 1877 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 1878 if (!AR) 1879 return false; 1880 1881 return AR->isAffine(); 1882} 1883 1884unsigned 1885LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { 1886 if (!VTTI) { 1887 DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); 1888 return 1; 1889 } 1890 1891 float Cost = expectedCost(1); 1892 unsigned Width = 1; 1893 DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); 1894 for (unsigned i=2; i <= VF; i*=2) { 1895 // Notice that the vector loop needs to be executed less times, so 1896 // we need to divide the cost of the vector loops by the width of 1897 // the vector elements. 1898 float VectorCost = expectedCost(i) / (float)i; 1899 DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << 1900 (int)VectorCost << ".\n"); 1901 if (VectorCost < Cost) { 1902 Cost = VectorCost; 1903 Width = i; 1904 } 1905 } 1906 1907 DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); 1908 return Width; 1909} 1910 1911unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 1912 // We can only estimate the cost of single basic block loops. 1913 assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop"); 1914 1915 BasicBlock *BB = TheLoop->getHeader(); 1916 unsigned Cost = 0; 1917 1918 // For each instruction in the old loop. 1919 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 1920 Instruction *Inst = it; 1921 unsigned C = getInstructionCost(Inst, VF); 1922 Cost += C; 1923 DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF << 1924 " For instruction: "<< *Inst << "\n"); 1925 } 1926 1927 return Cost; 1928} 1929 1930unsigned 1931LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 1932 assert(VTTI && "Invalid vector target transformation info"); 1933 1934 // If we know that this instruction will remain uniform, check the cost of 1935 // the scalar version. 1936 if (Legal->isUniformAfterVectorization(I)) 1937 VF = 1; 1938 1939 Type *RetTy = I->getType(); 1940 Type *VectorTy = ToVectorTy(RetTy, VF); 1941 1942 1943 // TODO: We need to estimate the cost of intrinsic calls. 1944 switch (I->getOpcode()) { 1945 case Instruction::GetElementPtr: 1946 // We mark this instruction as zero-cost because scalar GEPs are usually 1947 // lowered to the intruction addressing mode. At the moment we don't 1948 // generate vector geps. 1949 return 0; 1950 case Instruction::Br: { 1951 return VTTI->getCFInstrCost(I->getOpcode()); 1952 } 1953 case Instruction::PHI: 1954 return 0; 1955 case Instruction::Add: 1956 case Instruction::FAdd: 1957 case Instruction::Sub: 1958 case Instruction::FSub: 1959 case Instruction::Mul: 1960 case Instruction::FMul: 1961 case Instruction::UDiv: 1962 case Instruction::SDiv: 1963 case Instruction::FDiv: 1964 case Instruction::URem: 1965 case Instruction::SRem: 1966 case Instruction::FRem: 1967 case Instruction::Shl: 1968 case Instruction::LShr: 1969 case Instruction::AShr: 1970 case Instruction::And: 1971 case Instruction::Or: 1972 case Instruction::Xor: { 1973 return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); 1974 } 1975 case Instruction::Select: { 1976 SelectInst *SI = cast<SelectInst>(I); 1977 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 1978 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 1979 Type *CondTy = SI->getCondition()->getType(); 1980 if (ScalarCond) 1981 CondTy = VectorType::get(CondTy, VF); 1982 1983 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 1984 } 1985 case Instruction::ICmp: 1986 case Instruction::FCmp: { 1987 Type *ValTy = I->getOperand(0)->getType(); 1988 VectorTy = ToVectorTy(ValTy, VF); 1989 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); 1990 } 1991 case Instruction::Store: { 1992 StoreInst *SI = cast<StoreInst>(I); 1993 Type *ValTy = SI->getValueOperand()->getType(); 1994 VectorTy = ToVectorTy(ValTy, VF); 1995 1996 if (VF == 1) 1997 return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, 1998 SI->getAlignment(), SI->getPointerAddressSpace()); 1999 2000 // Scalarized stores. 2001 if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { 2002 unsigned Cost = 0; 2003 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 2004 ValTy); 2005 // The cost of extracting from the value vector. 2006 Cost += VF * (ExtCost); 2007 // The cost of the scalar stores. 2008 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 2009 ValTy->getScalarType(), 2010 SI->getAlignment(), 2011 SI->getPointerAddressSpace()); 2012 return Cost; 2013 } 2014 2015 // Wide stores. 2016 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), 2017 SI->getPointerAddressSpace()); 2018 } 2019 case Instruction::Load: { 2020 LoadInst *LI = cast<LoadInst>(I); 2021 2022 if (VF == 1) 2023 return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, 2024 LI->getAlignment(), 2025 LI->getPointerAddressSpace()); 2026 2027 // Scalarized loads. 2028 if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { 2029 unsigned Cost = 0; 2030 unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); 2031 // The cost of inserting the loaded value into the result vector. 2032 Cost += VF * (InCost); 2033 // The cost of the scalar stores. 2034 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 2035 RetTy->getScalarType(), 2036 LI->getAlignment(), 2037 LI->getPointerAddressSpace()); 2038 return Cost; 2039 } 2040 2041 // Wide loads. 2042 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), 2043 LI->getPointerAddressSpace()); 2044 } 2045 case Instruction::ZExt: 2046 case Instruction::SExt: 2047 case Instruction::FPToUI: 2048 case Instruction::FPToSI: 2049 case Instruction::FPExt: 2050 case Instruction::PtrToInt: 2051 case Instruction::IntToPtr: 2052 case Instruction::SIToFP: 2053 case Instruction::UIToFP: 2054 case Instruction::Trunc: 2055 case Instruction::FPTrunc: 2056 case Instruction::BitCast: { 2057 Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); 2058 return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 2059 } 2060 default: { 2061 // We are scalarizing the instruction. Return the cost of the scalar 2062 // instruction, plus the cost of insert and extract into vector 2063 // elements, times the vector width. 2064 unsigned Cost = 0; 2065 2066 bool IsVoid = RetTy->isVoidTy(); 2067 2068 unsigned InsCost = (IsVoid ? 0 : 2069 VTTI->getInstrCost(Instruction::InsertElement, 2070 VectorTy)); 2071 2072 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 2073 VectorTy); 2074 2075 // The cost of inserting the results plus extracting each one of the 2076 // operands. 2077 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 2078 2079 // The cost of executing VF copies of the scalar instruction. 2080 Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); 2081 return Cost; 2082 } 2083 }// end of switch. 2084} 2085 2086Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 2087 if (Scalar->isVoidTy() || VF == 1) 2088 return Scalar; 2089 return VectorType::get(Scalar, VF); 2090} 2091 2092} // namespace 2093 2094char LoopVectorize::ID = 0; 2095static const char lv_name[] = "Loop Vectorization"; 2096INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 2097INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 2098INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 2099INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 2100INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 2101 2102namespace llvm { 2103 Pass *createLoopVectorizePass() { 2104 return new LoopVectorize(); 2105 } 2106} 2107 2108