// LoopVectorize.cpp, revision b1bf1eeede72b8c93505dd80fdf21aed0e205c7d
//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR. Legalization of the IR is done
// in the codegen. However, the vectorizer uses (will use) the codegen
// interfaces to generate IR that is likely to result in an optimal binary.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. SingleBlockLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//===----------------------------------------------------------------------===//
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Value.h"
#include "llvm/Function.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;

// Command-line override for the vectorization factor; zero means "let the
// cost model (or the default) pick".
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                    cl::desc("Set the default vectorization width. Zero is autoselect."));

/// We don't vectorize loops with a known constant trip count below this number.
const unsigned TinyTripCountThreshold = 16;

/// When performing a runtime memory check, do not check more than this
/// number of pointers. Notice that the check is quadratic!
const unsigned RuntimeMemoryCheckThreshold = 2;

namespace {

// Forward declarations.
88class LoopVectorizationLegality; 89class LoopVectorizationCostModel; 90 91/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic 92/// block to a specified vectorization factor (VF). 93/// This class performs the widening of scalars into vectors, or multiple 94/// scalars. This class also implements the following features: 95/// * It inserts an epilogue loop for handling loops that don't have iteration 96/// counts that are known to be a multiple of the vectorization factor. 97/// * It handles the code generation for reduction variables. 98/// * Scalarization (implementation using scalars) of un-vectorizable 99/// instructions. 100/// SingleBlockLoopVectorizer does not perform any vectorization-legality 101/// checks, and relies on the caller to check for the different legality 102/// aspects. The SingleBlockLoopVectorizer relies on the 103/// LoopVectorizationLegality class to provide information about the induction 104/// and reduction variables that were found to a given vectorization factor. 105class SingleBlockLoopVectorizer { 106public: 107 /// Ctor. 108 SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, 109 DominatorTree *dt, LPPassManager *Lpm, 110 unsigned VecWidth): 111 OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth), 112 Builder(Se->getContext()), Induction(0), OldInduction(0) { } 113 114 // Perform the actual loop widening (vectorization). 115 void vectorize(LoopVectorizationLegality *Legal) { 116 ///Create a new empty loop. Unlink the old loop and connect the new one. 117 createEmptyLoop(Legal); 118 /// Widen each instruction in the old loop to a new one in the new loop. 119 /// Use the Legality module to find the induction and reduction variables. 120 vectorizeLoop(Legal); 121 // register the new loop. 122 updateAnalysis(); 123 } 124 125private: 126 /// Create an empty loop, based on the loop ranges of the old loop. 
127 void createEmptyLoop(LoopVectorizationLegality *Legal); 128 /// Copy and widen the instructions from the old loop. 129 void vectorizeLoop(LoopVectorizationLegality *Legal); 130 /// Insert the new loop to the loop hierarchy and pass manager. 131 void updateAnalysis(); 132 133 /// This instruction is un-vectorizable. Implement it as a sequence 134 /// of scalars. 135 void scalarizeInstruction(Instruction *Instr); 136 137 /// Create a broadcast instruction. This method generates a broadcast 138 /// instruction (shuffle) for loop invariant values and for the induction 139 /// value. If this is the induction variable then we extend it to N, N+1, ... 140 /// this is needed because each iteration in the loop corresponds to a SIMD 141 /// element. 142 Value *getBroadcastInstrs(Value *V); 143 144 /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. 145 /// for each element in the vector. Starting from zero. 146 Value *getConsecutiveVector(Value* Val); 147 148 /// When we go over instructions in the basic block we rely on previous 149 /// values within the current basic block or on loop invariant values. 150 /// When we widen (vectorize) values we place them in the map. If the values 151 /// are not within the map, they have to be loop invariant, so we simply 152 /// broadcast them into a vector. 153 Value *getVectorValue(Value *V); 154 155 /// Get a uniform vector of constant integers. We use this to get 156 /// vectors of ones and zeros for the reduction code. 157 Constant* getUniformVector(unsigned Val, Type* ScalarTy); 158 159 typedef DenseMap<Value*, Value*> ValueMap; 160 161 /// The original loop. 162 Loop *OrigLoop; 163 // Scev analysis to use. 164 ScalarEvolution *SE; 165 // Loop Info. 166 LoopInfo *LI; 167 // Dominator Tree. 168 DominatorTree *DT; 169 // Loop Pass Manager; 170 LPPassManager *LPM; 171 // The vectorization factor to use. 
172 unsigned VF; 173 174 // The builder that we use 175 IRBuilder<> Builder; 176 177 // --- Vectorization state --- 178 179 /// The vector-loop preheader. 180 BasicBlock *LoopVectorPreHeader; 181 /// The scalar-loop preheader. 182 BasicBlock *LoopScalarPreHeader; 183 /// Middle Block between the vector and the scalar. 184 BasicBlock *LoopMiddleBlock; 185 ///The ExitBlock of the scalar loop. 186 BasicBlock *LoopExitBlock; 187 ///The vector loop body. 188 BasicBlock *LoopVectorBody; 189 ///The scalar loop body. 190 BasicBlock *LoopScalarBody; 191 ///The first bypass block. 192 BasicBlock *LoopBypassBlock; 193 194 /// The new Induction variable which was added to the new block. 195 PHINode *Induction; 196 /// The induction variable of the old basic block. 197 PHINode *OldInduction; 198 // Maps scalars to widened vectors. 199 ValueMap WidenMap; 200}; 201 202/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and 203/// to what vectorization factor. 204/// This class does not look at the profitability of vectorization, only the 205/// legality. This class has two main kinds of checks: 206/// * Memory checks - The code in canVectorizeMemory checks if vectorization 207/// will change the order of memory accesses in a way that will change the 208/// correctness of the program. 209/// * Scalars checks - The code in canVectorizeBlock checks for a number 210/// of different conditions, such as the availability of a single induction 211/// variable, that all types are supported and vectorize-able, etc. 212/// This code reflects the capabilities of SingleBlockLoopVectorizer. 213/// This class is also used by SingleBlockLoopVectorizer for identifying 214/// induction variable and the different reduction variables. 215class LoopVectorizationLegality { 216public: 217 LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): 218 TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } 219 220 /// This represents the kinds of reductions that we support. 
221 enum ReductionKind { 222 NoReduction, /// Not a reduction. 223 IntegerAdd, /// Sum of numbers. 224 IntegerMult, /// Product of numbers. 225 IntegerOr, /// Bitwise or logical OR of numbers. 226 IntegerAnd, /// Bitwise or logical AND of numbers. 227 IntegerXor /// Bitwise or logical XOR of numbers. 228 }; 229 230 /// This POD struct holds information about reduction variables. 231 struct ReductionDescriptor { 232 // Default C'tor 233 ReductionDescriptor(): 234 StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} 235 236 // C'tor. 237 ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): 238 StartValue(Start), LoopExitInstr(Exit), Kind(K) {} 239 240 // The starting value of the reduction. 241 // It does not have to be zero! 242 Value *StartValue; 243 // The instruction who's value is used outside the loop. 244 Instruction *LoopExitInstr; 245 // The kind of the reduction. 246 ReductionKind Kind; 247 }; 248 249 // This POD struct holds information about the memory runtime legality 250 // check that a group of pointers do not overlap. 251 struct RuntimePointerCheck { 252 /// This flag indicates if we need to add the runtime check. 253 bool Need; 254 /// Holds the pointers that we need to check. 255 SmallVector<Value*, 2> Pointers; 256 }; 257 258 /// ReductionList contains the reduction descriptors for all 259 /// of the reductions that were found in the loop. 260 typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; 261 262 /// Returns true if it is legal to vectorize this loop. 263 /// This does not mean that it is profitable to vectorize this 264 /// loop, only that it is legal to do so. 265 bool canVectorize(); 266 267 /// Returns the Induction variable. 268 PHINode *getInduction() {return Induction;} 269 270 /// Returns the reduction variables found in the loop. 271 ReductionList *getReductionVars() { return &Reductions; } 272 273 /// Check if the pointer returned by this GEP is consecutive 274 /// when the index is vectorized. 
This happens when the last 275 /// index of the GEP is consecutive, like the induction variable. 276 /// This check allows us to vectorize A[idx] into a wide load/store. 277 bool isConsecutiveGep(Value *Ptr); 278 279 /// Returns true if the value V is uniform within the loop. 280 bool isUniform(Value *V); 281 282 /// Returns true if this instruction will remain scalar after vectorization. 283 bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} 284 285 /// Returns the information that we collected about runtime memory check. 286 RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; } 287private: 288 /// Check if a single basic block loop is vectorizable. 289 /// At this point we know that this is a loop with a constant trip count 290 /// and we only need to check individual instructions. 291 bool canVectorizeBlock(BasicBlock &BB); 292 293 /// When we vectorize loops we may change the order in which 294 /// we read and write from memory. This method checks if it is 295 /// legal to vectorize the code, considering only memory constrains. 296 /// Returns true if BB is vectorizable 297 bool canVectorizeMemory(BasicBlock &BB); 298 299 /// Returns True, if 'Phi' is the kind of reduction variable for type 300 /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. 301 bool AddReductionVar(PHINode *Phi, ReductionKind Kind); 302 /// Returns true if the instruction I can be a reduction variable of type 303 /// 'Kind'. 304 bool isReductionInstr(Instruction *I, ReductionKind Kind); 305 /// Returns True, if 'Phi' is an induction variable. 306 bool isInductionVariable(PHINode *Phi); 307 /// Return true if we 308 bool hasComputableBounds(Value *Ptr); 309 310 /// The loop that we evaluate. 311 Loop *TheLoop; 312 /// Scev analysis. 313 ScalarEvolution *SE; 314 /// DataLayout analysis. 315 DataLayout *DL; 316 317 // --- vectorization state --- // 318 319 /// Holds the induction variable. 
320 PHINode *Induction; 321 /// Holds the reduction variables. 322 ReductionList Reductions; 323 /// Allowed outside users. This holds the reduction 324 /// vars which can be accessed from outside the loop. 325 SmallPtrSet<Value*, 4> AllowedExit; 326 /// This set holds the variables which are known to be uniform after 327 /// vectorization. 328 SmallPtrSet<Instruction*, 4> Uniforms; 329 /// We need to check that all of the pointers in this list are disjoint 330 /// at runtime. 331 RuntimePointerCheck PtrRtCheck; 332}; 333 334/// LoopVectorizationCostModel - estimates the expected speedups due to 335/// vectorization. 336/// In many cases vectorization is not profitable. This can happen because 337/// of a number of reasons. In this class we mainly attempt to predict 338/// the expected speedup/slowdowns due to the supported instruction set. 339/// We use the VectorTargetTransformInfo to query the different backends 340/// for the cost of different operations. 341class LoopVectorizationCostModel { 342public: 343 /// C'tor. 344 LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, 345 LoopVectorizationLegality *Leg, 346 const VectorTargetTransformInfo *Vtti): 347 TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } 348 349 /// Returns the most profitable vectorization factor for the loop that is 350 /// smaller or equal to the VF argument. This method checks every power 351 /// of two up to VF. 352 unsigned findBestVectorizationFactor(unsigned VF = 8); 353 354private: 355 /// Returns the expected execution cost. The unit of the cost does 356 /// not matter because we use the 'cost' units to compare different 357 /// vector widths. The cost that is returned is *not* normalized by 358 /// the factor width. 359 unsigned expectedCost(unsigned VF); 360 361 /// Returns the execution time cost of an instruction for a given vector 362 /// width. Vector width of one means scalar. 
363 unsigned getInstructionCost(Instruction *I, unsigned VF); 364 365 /// A helper function for converting Scalar types to vector types. 366 /// If the incoming type is void, we return void. If the VF is 1, we return 367 /// the scalar type. 368 static Type* ToVectorTy(Type *Scalar, unsigned VF); 369 370 /// The loop that we evaluate. 371 Loop *TheLoop; 372 /// Scev analysis. 373 ScalarEvolution *SE; 374 375 /// Vectorization legality. 376 LoopVectorizationLegality *Legal; 377 /// Vector target information. 378 const VectorTargetTransformInfo *VTTI; 379}; 380 381struct LoopVectorize : public LoopPass { 382 static char ID; // Pass identification, replacement for typeid 383 384 LoopVectorize() : LoopPass(ID) { 385 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 386 } 387 388 ScalarEvolution *SE; 389 DataLayout *DL; 390 LoopInfo *LI; 391 TargetTransformInfo *TTI; 392 DominatorTree *DT; 393 394 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { 395 // We only vectorize innermost loops. 396 if (!L->empty()) 397 return false; 398 399 SE = &getAnalysis<ScalarEvolution>(); 400 DL = getAnalysisIfAvailable<DataLayout>(); 401 LI = &getAnalysis<LoopInfo>(); 402 TTI = getAnalysisIfAvailable<TargetTransformInfo>(); 403 DT = &getAnalysis<DominatorTree>(); 404 405 DEBUG(dbgs() << "LV: Checking a loop in \"" << 406 L->getHeader()->getParent()->getName() << "\"\n"); 407 408 // Check if it is legal to vectorize the loop. 409 LoopVectorizationLegality LVL(L, SE, DL); 410 if (!LVL.canVectorize()) { 411 DEBUG(dbgs() << "LV: Not vectorizing.\n"); 412 return false; 413 } 414 415 // Select the preffered vectorization factor. 416 unsigned VF = 1; 417 if (VectorizationFactor == 0) { 418 const VectorTargetTransformInfo *VTTI = 0; 419 if (TTI) 420 VTTI = TTI->getVectorTargetTransformInfo(); 421 // Use the cost model. 
422 LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); 423 VF = CM.findBestVectorizationFactor(); 424 425 if (VF == 1) { 426 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 427 return false; 428 } 429 430 } else { 431 // Use the user command flag. 432 VF = VectorizationFactor; 433 } 434 435 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< 436 L->getHeader()->getParent()->getParent()->getModuleIdentifier()<< 437 "\n"); 438 439 // If we decided that it is *legal* to vectorizer the loop then do it. 440 SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF); 441 LB.vectorize(&LVL); 442 443 DEBUG(verifyFunction(*L->getHeader()->getParent())); 444 return true; 445 } 446 447 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 448 LoopPass::getAnalysisUsage(AU); 449 AU.addRequiredID(LoopSimplifyID); 450 AU.addRequiredID(LCSSAID); 451 AU.addRequired<LoopInfo>(); 452 AU.addRequired<ScalarEvolution>(); 453 AU.addRequired<DominatorTree>(); 454 AU.addPreserved<LoopInfo>(); 455 AU.addPreserved<DominatorTree>(); 456 } 457 458}; 459 460Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { 461 // Instructions that access the old induction variable 462 // actually want to get the new one. 463 if (V == OldInduction) 464 V = Induction; 465 // Create the types. 466 LLVMContext &C = V->getContext(); 467 Type *VTy = VectorType::get(V->getType(), VF); 468 Type *I32 = IntegerType::getInt32Ty(C); 469 Constant *Zero = ConstantInt::get(I32, 0); 470 Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); 471 Value *UndefVal = UndefValue::get(VTy); 472 // Insert the value into a new vector. 473 Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); 474 // Broadcast the scalar into all locations in the vector. 475 Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, 476 "broadcast"); 477 // We are accessing the induction variable. 
Make sure to promote the 478 // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. 479 if (V == Induction) 480 return getConsecutiveVector(Shuf); 481 return Shuf; 482} 483 484Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { 485 assert(Val->getType()->isVectorTy() && "Must be a vector"); 486 assert(Val->getType()->getScalarType()->isIntegerTy() && 487 "Elem must be an integer"); 488 // Create the types. 489 Type *ITy = Val->getType()->getScalarType(); 490 VectorType *Ty = cast<VectorType>(Val->getType()); 491 unsigned VLen = Ty->getNumElements(); 492 SmallVector<Constant*, 8> Indices; 493 494 // Create a vector of consecutive numbers from zero to VF. 495 for (unsigned i = 0; i < VLen; ++i) 496 Indices.push_back(ConstantInt::get(ITy, i)); 497 498 // Add the consecutive indices to the vector value. 499 Constant *Cv = ConstantVector::get(Indices); 500 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 501 return Builder.CreateAdd(Val, Cv, "induction"); 502} 503 504bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { 505 GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 506 if (!Gep) 507 return false; 508 509 unsigned NumOperands = Gep->getNumOperands(); 510 Value *LastIndex = Gep->getOperand(NumOperands - 1); 511 512 // Check that all of the gep indices are uniform except for the last. 513 for (unsigned i = 0; i < NumOperands - 1; ++i) 514 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 515 return false; 516 517 // We can emit wide load/stores only of the last index is the induction 518 // variable. 519 const SCEV *Last = SE->getSCEV(LastIndex); 520 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 521 const SCEV *Step = AR->getStepRecurrence(*SE); 522 523 // The memory is consecutive because the last index is consecutive 524 // and all other indices are loop invariant. 
525 if (Step->isOne()) 526 return true; 527 } 528 529 return false; 530} 531 532bool LoopVectorizationLegality::isUniform(Value *V) { 533 return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); 534} 535 536Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { 537 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 538 // If we saved a vectorized copy of V, use it. 539 Value *&MapEntry = WidenMap[V]; 540 if (MapEntry) 541 return MapEntry; 542 543 // Broadcast V and save the value for future uses. 544 Value *B = getBroadcastInstrs(V); 545 MapEntry = B; 546 return B; 547} 548 549Constant* 550SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { 551 SmallVector<Constant*, 8> Indices; 552 // Create a vector of consecutive numbers from zero to VF. 553 for (unsigned i = 0; i < VF; ++i) 554 Indices.push_back(ConstantInt::get(ScalarTy, Val, true)); 555 556 // Add the consecutive indices to the vector value. 557 return ConstantVector::get(Indices); 558} 559 560void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 561 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 562 // Holds vector parameters or scalars, in case of uniform vals. 563 SmallVector<Value*, 8> Params; 564 565 // Find all of the vectorized parameters. 566 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 567 Value *SrcOp = Instr->getOperand(op); 568 569 // If we are accessing the old induction variable, use the new one. 570 if (SrcOp == OldInduction) { 571 Params.push_back(getBroadcastInstrs(Induction)); 572 continue; 573 } 574 575 // Try using previously calculated values. 576 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 577 578 // If the src is an instruction that appeared earlier in the basic block 579 // then it should already be vectorized. 
580 if (SrcInst && SrcInst->getParent() == Instr->getParent()) { 581 assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); 582 // The parameter is a vector value from earlier. 583 Params.push_back(WidenMap[SrcInst]); 584 } else { 585 // The parameter is a scalar from outside the loop. Maybe even a constant. 586 Params.push_back(SrcOp); 587 } 588 } 589 590 assert(Params.size() == Instr->getNumOperands() && 591 "Invalid number of operands"); 592 593 // Does this instruction return a value ? 594 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 595 Value *VecResults = 0; 596 597 // If we have a return value, create an empty vector. We place the scalarized 598 // instructions in this vector. 599 if (!IsVoidRetTy) 600 VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); 601 602 // For each scalar that we create: 603 for (unsigned i = 0; i < VF; ++i) { 604 Instruction *Cloned = Instr->clone(); 605 if (!IsVoidRetTy) 606 Cloned->setName(Instr->getName() + ".cloned"); 607 // Replace the operands of the cloned instrucions with extracted scalars. 608 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 609 Value *Op = Params[op]; 610 // Param is a vector. Need to extract the right lane. 611 if (Op->getType()->isVectorTy()) 612 Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); 613 Cloned->setOperand(op, Op); 614 } 615 616 // Place the cloned scalar in the new loop. 617 Builder.Insert(Cloned); 618 619 // If the original scalar returns a value we need to place it in a vector 620 // so that future users will be able to use it. 621 if (!IsVoidRetTy) 622 VecResults = Builder.CreateInsertElement(VecResults, Cloned, 623 Builder.getInt32(i)); 624 } 625 626 if (!IsVoidRetTy) 627 WidenMap[Instr] = VecResults; 628} 629 630void 631SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 632 /* 633 In this function we generate a new loop. 
The new loop will contain 634 the vectorized instructions while the old loop will continue to run the 635 scalar remainder. 636 637 [ ] <-- vector loop bypass. 638 / | 639 / v 640| [ ] <-- vector pre header. 641| | 642| v 643| [ ] \ 644| [ ]_| <-- vector loop. 645| | 646 \ v 647 >[ ] <--- middle-block. 648 / | 649 / v 650| [ ] <--- new preheader. 651| | 652| v 653| [ ] \ 654| [ ]_| <-- old scalar loop to handle remainder. 655 \ | 656 \ v 657 >[ ] <-- exit block. 658 ... 659 */ 660 661 OldInduction = Legal->getInduction(); 662 assert(OldInduction && "We must have a single phi node."); 663 Type *IdxTy = OldInduction->getType(); 664 665 // Find the loop boundaries. 666 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); 667 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 668 669 // Get the total trip count from the count by adding 1. 670 ExitCount = SE->getAddExpr(ExitCount, 671 SE->getConstant(ExitCount->getType(), 1)); 672 // We may need to extend the index in case there is a type mismatch. 673 // We know that the count starts at zero and does not overflow. 674 // We are using Zext because it should be less expensive. 675 if (ExitCount->getType() != IdxTy) 676 ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); 677 678 // This is the original scalar-loop preheader. 679 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 680 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 681 assert(ExitBlock && "Must have an exit block"); 682 683 // The loop index does not have to start at Zero. It starts with this value. 
684 Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock); 685 686 assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); 687 assert(BypassBlock && "Invalid loop structure"); 688 689 BasicBlock *VectorPH = 690 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 691 BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), 692 "vector.body"); 693 694 BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), 695 "middle.block"); 696 BasicBlock *ScalarPH = 697 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), 698 "scalar.preheader"); 699 // Find the induction variable. 700 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 701 702 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 703 // inside the loop. 704 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 705 706 // Generate the induction variable. 707 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 708 Constant *Step = ConstantInt::get(IdxTy, VF); 709 710 // Expand the trip count and place the new instructions in the preheader. 711 // Notice that the pre-header does not change, only the loop body. 712 SCEVExpander Exp(*SE, "induction"); 713 Instruction *Loc = BypassBlock->getTerminator(); 714 715 // Count holds the overall loop count (N). 716 Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); 717 718 // Add the start index to the loop count to get the new end index. 719 Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); 720 721 // Now we need to generate the expression for N - (N % VF), which is 722 // the part that the vectorized body will execute. 
723 Constant *CIVF = ConstantInt::get(IdxTy, VF); 724 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); 725 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 726 Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, 727 "end.idx.rnd.down", Loc); 728 729 // Now, compare the new count to zero. If it is zero, jump to the scalar part. 730 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 731 IdxEndRoundDown, 732 StartIdx, 733 "cmp.zero", Loc); 734 735 LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = 736 Legal->getRuntimePointerCheck(); 737 Value *MemoryRuntimeCheck = 0; 738 if (PtrRtCheck->Need) { 739 unsigned NumPointers = PtrRtCheck->Pointers.size(); 740 SmallVector<Value* , 2> Starts; 741 SmallVector<Value* , 2> Ends; 742 743 // Use this type for pointer arithmetic. 744 Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType(); 745 746 for (unsigned i=0; i < NumPointers; ++i) { 747 Value *Ptr = PtrRtCheck->Pointers[i]; 748 const SCEV *Sc = SE->getSCEV(Ptr); 749 750 if (SE->isLoopInvariant(Sc, OrigLoop)) { 751 DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" << 752 *Ptr <<"\n"); 753 Starts.push_back(Ptr); 754 Ends.push_back(Ptr); 755 } else { 756 DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); 757 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); 758 Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc); 759 const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); 760 const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); 761 assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range."); 762 Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc); 763 Starts.push_back(Start); 764 Ends.push_back(End); 765 } 766 } 767 768 for (unsigned i=0; i < NumPointers; ++i) { 769 for (unsigned j=i+1; j < NumPointers; ++j) { 770 Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 771 Starts[0], Ends[1], 
"bound0", Loc); 772 Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 773 Starts[1], Ends[0], "bound1", Loc); 774 Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, 775 "found.conflict", Loc); 776 if (MemoryRuntimeCheck) { 777 MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, 778 MemoryRuntimeCheck, 779 IsConflict, 780 "conflict.rdx", Loc); 781 } else { 782 MemoryRuntimeCheck = IsConflict; 783 } 784 } 785 } 786 }// end of need-runtime-check code. 787 788 // If we are using memory runtime checks, include them in. 789 if (MemoryRuntimeCheck) { 790 Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, 791 "CntOrMem", Loc); 792 } 793 794 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 795 // Remove the old terminator. 796 Loc->eraseFromParent(); 797 798 // We are going to resume the execution of the scalar loop. 799 // This PHI decides on what number to start. If we come from the 800 // vector loop then we need to start with the end index minus the 801 // index modulo VF. If we come from a bypass edge then we need to start 802 // from the real start. 803 PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx", 804 MiddleBlock->getTerminator()); 805 ResumeIndex->addIncoming(StartIdx, BypassBlock); 806 ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); 807 808 // Add a check in the middle block to see if we have completed 809 // all of the iterations in the first vector loop. 810 // If (N - N%VF) == N, then we *don't* need to run the remainder. 811 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 812 ResumeIndex, "cmp.n", 813 MiddleBlock->getTerminator()); 814 815 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 816 // Remove the old terminator. 817 MiddleBlock->getTerminator()->eraseFromParent(); 818 819 // Create i+1 and fill the PHINode. 
  // Advance the vector induction variable by the full vector width (VF)
  // each iteration, instead of by one as in the scalar loop.
  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(StartIdx, VectorPH);
  Induction->addIncoming(NextIdx, VecBody);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);

  // Now we have two terminators. Remove the old one from the block.
  VecBody->getTerminator()->eraseFromParent();

  // Fix the scalar body iteration count.
  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
  OldInduction->setIncomingValue(BlockIdx, ResumeIndex);

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Register the new loop with the pass manager so later loop passes see it.
  Loop* Lp = new Loop();
  LPM->insertLoop(Lp, OrigLoop->getParentLoop());

  Lp->addBasicBlockToLoop(VecBody, LI->getBase());

  Loop *ParentLoop = OrigLoop->getParentLoop();
  if (ParentLoop) {
    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
  }

  // Save the state so that vectorizeLoop/updateAnalysis can use these blocks.
  LoopVectorPreHeader = VectorPH;
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;
  LoopBypassBlock = BypassBlock;
}

/// This function returns the identity element (or neutral element) for
/// the operation K.
static unsigned
getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
  switch (K) {
  case LoopVectorizationLegality::IntegerXor:
  case LoopVectorizationLegality::IntegerAdd:
  case LoopVectorizationLegality::IntegerOr:
    // Adding, Xoring, Oring zero to a number does not change it.
    return 0;
  case LoopVectorizationLegality::IntegerMult:
    // Multiplying a number by 1 does not change it.
    return 1;
  case LoopVectorizationLegality::IntegerAnd:
    // AND-ing a number with an all-1 value does not change it.
    // (unsigned wrap of -1 gives the all-ones bit pattern.)
    return -1;
  default:
    llvm_unreachable("Unknown reduction kind");
  }
}

void
SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//
  typedef SmallVector<PHINode*, 4> PhiVector;
  BasicBlock &BB = *OrigLoop->getHeader();
  Constant *Zero = ConstantInt::get(
    IntegerType::getInt32Ty(BB.getContext()), 0);

  // In order to support reduction variables we need to be able to vectorize
  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
  // stages. First, we create a new vector PHI node with no incoming edges.
  // We use this value when we vectorize all of the instructions that use the
  // PHI. Next, after all of the instructions in the block are complete we
  // add the new incoming edges to the PHI. At this point all of the
  // instructions in the basic block are vectorized, so we can use them to
  // construct the PHI.
  PhiVector PHIsToFix;

  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *Inst = it;

    switch (Inst->getOpcode()) {
    case Instruction::Br:
      // Nothing to do for PHIs and BR, since we already took care of the
      // loop control flow instructions.
      continue;
    case Instruction::PHI:{
      PHINode* P = cast<PHINode>(Inst);
      // Special handling for the induction var.
      if (OldInduction == Inst)
        continue;
      // This is phase one of vectorizing PHIs.
      // This has to be a reduction variable.
921 assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); 922 Type *VecTy = VectorType::get(Inst->getType(), VF); 923 WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); 924 PHIsToFix.push_back(P); 925 continue; 926 } 927 case Instruction::Add: 928 case Instruction::FAdd: 929 case Instruction::Sub: 930 case Instruction::FSub: 931 case Instruction::Mul: 932 case Instruction::FMul: 933 case Instruction::UDiv: 934 case Instruction::SDiv: 935 case Instruction::FDiv: 936 case Instruction::URem: 937 case Instruction::SRem: 938 case Instruction::FRem: 939 case Instruction::Shl: 940 case Instruction::LShr: 941 case Instruction::AShr: 942 case Instruction::And: 943 case Instruction::Or: 944 case Instruction::Xor: { 945 // Just widen binops. 946 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); 947 Value *A = getVectorValue(Inst->getOperand(0)); 948 Value *B = getVectorValue(Inst->getOperand(1)); 949 950 // Use this vector value for all users of the original instruction. 951 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); 952 WidenMap[Inst] = V; 953 954 // Update the NSW, NUW and Exact flags. 955 BinaryOperator *VecOp = cast<BinaryOperator>(V); 956 if (isa<OverflowingBinaryOperator>(BinOp)) { 957 VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); 958 VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); 959 } 960 if (isa<PossiblyExactOperator>(VecOp)) 961 VecOp->setIsExact(BinOp->isExact()); 962 break; 963 } 964 case Instruction::Select: { 965 // Widen selects. 966 // If the selector is loop invariant we can create a select 967 // instruction with a scalar condition. Otherwise, use vector-select. 968 Value *Cond = Inst->getOperand(0); 969 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); 970 971 // The condition can be loop invariant but still defined inside the 972 // loop. This means that we can't just use the original 'cond' value. 973 // We have to take the 'vectorized' value and pick the first lane. 
974 // Instcombine will make this a no-op. 975 Cond = getVectorValue(Cond); 976 if (InvariantCond) 977 Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); 978 979 Value *Op0 = getVectorValue(Inst->getOperand(1)); 980 Value *Op1 = getVectorValue(Inst->getOperand(2)); 981 WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); 982 break; 983 } 984 985 case Instruction::ICmp: 986 case Instruction::FCmp: { 987 // Widen compares. Generate vector compares. 988 bool FCmp = (Inst->getOpcode() == Instruction::FCmp); 989 CmpInst *Cmp = dyn_cast<CmpInst>(Inst); 990 Value *A = getVectorValue(Inst->getOperand(0)); 991 Value *B = getVectorValue(Inst->getOperand(1)); 992 if (FCmp) 993 WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 994 else 995 WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); 996 break; 997 } 998 999 case Instruction::Store: { 1000 // Attempt to issue a wide store. 1001 StoreInst *SI = dyn_cast<StoreInst>(Inst); 1002 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); 1003 Value *Ptr = SI->getPointerOperand(); 1004 unsigned Alignment = SI->getAlignment(); 1005 1006 assert(!Legal->isUniform(Ptr) && 1007 "We do not allow storing to uniform addresses"); 1008 1009 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1010 1011 // This store does not use GEPs. 1012 if (!Legal->isConsecutiveGep(Gep)) { 1013 scalarizeInstruction(Inst); 1014 break; 1015 } 1016 1017 // The last index does not have to be the induction. It can be 1018 // consecutive and be a function of the index. For example A[I+1]; 1019 unsigned NumOperands = Gep->getNumOperands(); 1020 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); 1021 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1022 1023 // Create the new GEP with the new induction variable. 
1024 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1025 Gep2->setOperand(NumOperands - 1, LastIndex); 1026 Ptr = Builder.Insert(Gep2); 1027 Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); 1028 Value *Val = getVectorValue(SI->getValueOperand()); 1029 Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); 1030 break; 1031 } 1032 case Instruction::Load: { 1033 // Attempt to issue a wide load. 1034 LoadInst *LI = dyn_cast<LoadInst>(Inst); 1035 Type *RetTy = VectorType::get(LI->getType(), VF); 1036 Value *Ptr = LI->getPointerOperand(); 1037 unsigned Alignment = LI->getAlignment(); 1038 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1039 1040 // If we don't have a gep, or that the pointer is loop invariant, 1041 // scalarize the load. 1042 if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) { 1043 scalarizeInstruction(Inst); 1044 break; 1045 } 1046 1047 // The last index does not have to be the induction. It can be 1048 // consecutive and be a function of the index. For example A[I+1]; 1049 unsigned NumOperands = Gep->getNumOperands(); 1050 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); 1051 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1052 1053 // Create the new GEP with the new induction variable. 1054 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1055 Gep2->setOperand(NumOperands - 1, LastIndex); 1056 Ptr = Builder.Insert(Gep2); 1057 Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); 1058 LI = Builder.CreateLoad(Ptr); 1059 LI->setAlignment(Alignment); 1060 // Use this vector value for all users of the load. 
1061 WidenMap[Inst] = LI; 1062 break; 1063 } 1064 case Instruction::ZExt: 1065 case Instruction::SExt: 1066 case Instruction::FPToUI: 1067 case Instruction::FPToSI: 1068 case Instruction::FPExt: 1069 case Instruction::PtrToInt: 1070 case Instruction::IntToPtr: 1071 case Instruction::SIToFP: 1072 case Instruction::UIToFP: 1073 case Instruction::Trunc: 1074 case Instruction::FPTrunc: 1075 case Instruction::BitCast: { 1076 /// Vectorize bitcasts. 1077 CastInst *CI = dyn_cast<CastInst>(Inst); 1078 Value *A = getVectorValue(Inst->getOperand(0)); 1079 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 1080 WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); 1081 break; 1082 } 1083 1084 default: 1085 /// All other instructions are unsupported. Scalarize them. 1086 scalarizeInstruction(Inst); 1087 break; 1088 }// end of switch. 1089 }// end of for_each instr. 1090 1091 // At this point every instruction in the original loop is widended to 1092 // a vector form. We are almost done. Now, we need to fix the PHI nodes 1093 // that we vectorized. The PHI nodes are currently empty because we did 1094 // not want to introduce cycles. Notice that the remaining PHI nodes 1095 // that we need to fix are reduction variables. 1096 1097 // Create the 'reduced' values for each of the induction vars. 1098 // The reduced values are the vector values that we scalarize and combine 1099 // after the loop is finished. 1100 for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end(); 1101 it != e; ++it) { 1102 PHINode *RdxPhi = *it; 1103 PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]); 1104 assert(RdxPhi && "Unable to recover vectorized PHI"); 1105 1106 // Find the reduction variable descriptor. 
1107 assert(Legal->getReductionVars()->count(RdxPhi) && 1108 "Unable to find the reduction variable"); 1109 LoopVectorizationLegality::ReductionDescriptor RdxDesc = 1110 (*Legal->getReductionVars())[RdxPhi]; 1111 1112 // We need to generate a reduction vector from the incoming scalar. 1113 // To do so, we need to generate the 'identity' vector and overide 1114 // one of the elements with the incoming scalar reduction. We need 1115 // to do it in the vector-loop preheader. 1116 Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); 1117 1118 // This is the vector-clone of the value that leaves the loop. 1119 Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); 1120 Type *VecTy = VectorExit->getType(); 1121 1122 // Find the reduction identity variable. Zero for addition, or, xor, 1123 // one for multiplication, -1 for And. 1124 Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), 1125 VecTy->getScalarType()); 1126 1127 // This vector is the Identity vector where the first element is the 1128 // incoming scalar reduction. 1129 Value *VectorStart = Builder.CreateInsertElement(Identity, 1130 RdxDesc.StartValue, Zero); 1131 1132 1133 // Fix the vector-loop phi. 1134 // We created the induction variable so we know that the 1135 // preheader is the first entry. 1136 BasicBlock *VecPreheader = Induction->getIncomingBlock(0); 1137 1138 // Reductions do not have to start at zero. They can start with 1139 // any loop invariant values. 1140 VecRdxPhi->addIncoming(VectorStart, VecPreheader); 1141 unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); 1142 Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); 1143 VecRdxPhi->addIncoming(Val, LoopVectorBody); 1144 1145 // Before each round, move the insertion point right between 1146 // the PHIs and the values we are going to write. 1147 // This allows us to write both PHINodes and the extractelement 1148 // instructions. 
1149 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); 1150 1151 // This PHINode contains the vectorized reduction variable, or 1152 // the initial value vector, if we bypass the vector loop. 1153 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); 1154 NewPhi->addIncoming(VectorStart, LoopBypassBlock); 1155 NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); 1156 1157 // Extract the first scalar. 1158 Value *Scalar0 = 1159 Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); 1160 // Extract and reduce the remaining vector elements. 1161 for (unsigned i=1; i < VF; ++i) { 1162 Value *Scalar1 = 1163 Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); 1164 switch (RdxDesc.Kind) { 1165 case LoopVectorizationLegality::IntegerAdd: 1166 Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); 1167 break; 1168 case LoopVectorizationLegality::IntegerMult: 1169 Scalar0 = Builder.CreateMul(Scalar0, Scalar1); 1170 break; 1171 case LoopVectorizationLegality::IntegerOr: 1172 Scalar0 = Builder.CreateOr(Scalar0, Scalar1); 1173 break; 1174 case LoopVectorizationLegality::IntegerAnd: 1175 Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); 1176 break; 1177 case LoopVectorizationLegality::IntegerXor: 1178 Scalar0 = Builder.CreateXor(Scalar0, Scalar1); 1179 break; 1180 default: 1181 llvm_unreachable("Unknown reduction operation"); 1182 } 1183 } 1184 1185 // Now, we need to fix the users of the reduction variable 1186 // inside and outside of the scalar remainder loop. 1187 // We know that the loop is in LCSSA form. We need to update the 1188 // PHI nodes in the exit blocks. 1189 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 1190 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 1191 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 1192 if (!LCSSAPhi) continue; 1193 1194 // All PHINodes need to have a single entry edge, or two if 1195 // we already fixed them. 
1196 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 1197 1198 // We found our reduction value exit-PHI. Update it with the 1199 // incoming bypass edge. 1200 if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { 1201 // Add an edge coming from the bypass. 1202 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); 1203 break; 1204 } 1205 }// end of the LCSSA phi scan. 1206 1207 // Fix the scalar loop reduction variable with the incoming reduction sum 1208 // from the vector body and from the backedge value. 1209 int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); 1210 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. 1211 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); 1212 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); 1213 }// end of for each redux variable. 1214} 1215 1216void SingleBlockLoopVectorizer::updateAnalysis() { 1217 // The original basic block. 1218 SE->forgetLoop(OrigLoop); 1219 1220 // Update the dominator tree information. 1221 assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && 1222 "Entry does not dominate exit."); 1223 1224 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); 1225 DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); 1226 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); 1227 DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); 1228 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 1229 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 1230 1231 DEBUG(DT->verifyAnalysis()); 1232} 1233 1234bool LoopVectorizationLegality::canVectorize() { 1235 if (!TheLoop->getLoopPreheader()) { 1236 assert(false && "No preheader!!"); 1237 DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); 1238 return false; 1239 } 1240 1241 // We can only vectorize single basic block loops. 
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1) {
    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
    return false;
  }

  // We need to have a loop header.
  BasicBlock *BB = TheLoop->getHeader();
  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
  if (ExitCount == SE->getCouldNotCompute()) {
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Do not loop-vectorize loops with a tiny trip count.
  // (TC == 0 means the trip count is unknown, which is fine.)
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
  if (TC > 0u && TC < TinyTripCountThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
          "This loop is not worth vectorizing.\n");
    return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeBlock(*BB)) {
    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
    return false;
  }

  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
        <<"!\n");

  // Okay! We can vectorize. At this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return true;
}

// Checks the single loop block for legality hazards: PHIs must be the
// induction variable or a recognized integer reduction, no calls, no vector
// types, and no non-reduction values used outside the block. On success it
// also records the set of values that stay uniform after vectorization.
bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
  // Scan the instructions in the block and look for hazards.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    PHINode *Phi = dyn_cast<PHINode>(I);
    if (Phi) {
      // This should not happen because the loop should be normalized.
      if (Phi->getNumIncomingValues() != 2) {
        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
        return false;
      }
      // We only look at integer phi nodes.
      if (!Phi->getType()->isIntegerTy()) {
        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
        return false;
      }

      if (isInductionVariable(Phi)) {
        // Only a single induction variable is supported.
        if (Induction) {
          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
          return false;
        }
        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
        Induction = Phi;
        continue;
      }
      // Try each supported reduction kind in turn.
      if (AddReductionVar(Phi, IntegerAdd)) {
        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerMult)) {
        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerOr)) {
        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerAnd)) {
        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerXor)) {
        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
        continue;
      }

      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
      return false;
    }// end of PHI handling

    // We still don't handle functions.
    CallInst *CI = dyn_cast<CallInst>(I);
    if (CI) {
      DEBUG(dbgs() << "LV: Found a call site.\n");
      return false;
    }

    // We do not re-vectorize vectors.
    if (!VectorType::isValidElementType(I->getType()) &&
        !I->getType()->isVoidTy()) {
      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
      return false;
    }

    // Reduction instructions are allowed to have exit users.
    // All other instructions must not have external users.
    if (!AllowedExit.count(I))
      // Check that all of the users of the instruction are inside the BB.
      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
           it != e; ++it) {
        Instruction *U = cast<Instruction>(*it);
        // This user may be a reduction exit value.
        BasicBlock *Parent = U->getParent();
        if (Parent != &BB) {
          DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
          return false;
        }
      }
  } // next instr.

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find an induction var.\n");
    return false;
  }

  // Don't vectorize if the memory dependencies do not allow vectorization.
  if (!canVectorizeMemory(BB))
    return false;

  // We now know that the loop is vectorizable!
  // Collect variables that will remain uniform after vectorization.
  std::vector<Value*> Worklist;

  // Start with the conditional branch and walk up the block.
  Worklist.push_back(BB.getTerminator()->getOperand(0));

  while (Worklist.size()) {
    Instruction *I = dyn_cast<Instruction>(Worklist.back());
    Worklist.pop_back();
    // Look at instructions inside this block.
    if (!I) continue;
    if (I->getParent() != &BB) continue;

    // Stop when reaching PHI nodes.
    if (isa<PHINode>(I)) {
      assert(I == Induction && "Found a uniform PHI that is not the induction");
      break;
    }

    // This is a known uniform.
    Uniforms.insert(I);

    // Insert all operands.
    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
      Worklist.push_back(I->getOperand(i));
    }
  }

  return true;
}

// Analyzes the loads and stores in BB and decides whether their memory
// dependences allow vectorization, possibly requiring a runtime overlap
// check (recorded in PtrRtCheck).
bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;
  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  // Scan the BB and collect legal loads and stores.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    // If this is a load, save it. If this instruction can read from memory
    // but is not a load, then we quit. Notice that we don't handle function
    // calls that read or write.
    if (I->mayReadFromMemory()) {
      LoadInst *Ld = dyn_cast<LoadInst>(I);
      if (!Ld) return false;
      if (!Ld->isSimple()) {
        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
        return false;
      }
      Loads.push_back(Ld);
      continue;
    }

    // Save store instructions. Abort if other instructions write to memory.
    if (I->mayWriteToMemory()) {
      StoreInst *St = dyn_cast<StoreInst>(I);
      if (!St) return false;
      if (!St->isSimple()) {
        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
        return false;
      }
      Stores.push_back(St);
    }
  } // next instr.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }

  // Holds the read and read-write *pointers* that we find.
  ValueVector Reads;
  ValueVector ReadWrites;

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = dyn_cast<StoreInst>(*I);
    assert(ST && "Bad StoreInst");
    Value* Ptr = ST->getPointerOperand();

    if (isUniform(Ptr)) {
      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
      return false;
    }

    // If we did *not* see this pointer before, insert it to
    // the read-write list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr))
      ReadWrites.push_back(Ptr);
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = dyn_cast<LoadInst>(*I);
    assert(LD && "Bad LoadInst");
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
      Reads.push_back(Ptr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then it is safe to vectorize.
  if (ReadWrites.size() == 1 && Reads.size() == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool RT = true;
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.Pointers.push_back(*I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.Pointers.push_back(*I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }

  // Check that we did not collect too many pointers or found an
  // unsizeable pointer.
  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.Pointers.clear();
    RT = false;
  }

  PtrRtCheck.Need = RT;

  if (RT) {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }

  // Now that the pointers are in two lists (Reads and ReadWrites), we
  // can check that there are no conflicts between each of the writes and
  // between the writes to the reads.
  // Note: from here on a conflict returns 'RT' — vectorization is still
  // possible if a runtime check is available.
  ValueSet WriteObjects;
  ValueVector TempObjects;

  // Check that the read-writes do not conflict with other read-write
  // pointers.
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
        return RT;
      }
      if (!WriteObjects.insert(*it)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  /// Check that the reads don't conflict with the read-writes.
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
        return RT;
      }
      if (WriteObjects.count(*it)) {
        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  // It is safe to vectorize and we don't need any runtime checks.
  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;
  return true;
}

// Tries to recognize Phi as the header PHI of a reduction of kind 'Kind'
// (e.g. sum += ...). On success it records the reduction descriptor and
// whitelists the exit instruction for out-of-loop uses.
bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Find the possible incoming reduction variable.
  BasicBlock *BB = Phi->getParent();
  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction, variables (which form a cycle
  // which ends in the phi node).
  Instruction *ExitInstruction = 0;

  // Iter is our iterator. We start with the PHI node and scan for all of the
  // users of this instruction. All users must be instructions which can be
  // used as reduction variables (such as ADD). We may have a single
  // out-of-block user. The cycle must end with the original PHI.
  // Also, we can't have multiple block-local users.
  Instruction *Iter = Phi;
  while (true) {
    // Any reduction instr must be of one of the allowed kinds.
    if (!isReductionInstr(Iter, Kind))
      return false;

    // Did we find a user inside this block?
    bool FoundInBlockUser = false;
    // Did we reach the initial PHI node?
    bool FoundStartPHI = false;

    // If the instruction has no users then this is a broken
    // chain and can't be a reduction variable.
    if (Iter->use_empty())
      return false;

    // For each of the *users* of iter.
    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
         it != e; ++it) {
      Instruction *U = cast<Instruction>(*it);
      // We already know that the PHI is a user.
      if (U == Phi) {
        FoundStartPHI = true;
        continue;
      }
      // Check if we found the exit user.
      BasicBlock *Parent = U->getParent();
      if (Parent != BB) {
        // We must have a single exit instruction.
        if (ExitInstruction != 0)
          return false;
        ExitInstruction = Iter;
      }
      // We can't have multiple inside users.
      if (FoundInBlockUser)
        return false;
      FoundInBlockUser = true;
      // Follow the chain to the next instruction in the cycle.
      Iter = U;
    }

    // We found a reduction var if we have reached the original
    // phi node and we only have a single instruction with out-of-loop
    // users.
    if (FoundStartPHI && ExitInstruction) {
      // This instruction is allowed to have out-of-loop users.
      AllowedExit.insert(ExitInstruction);

      // Save the description of this reduction variable.
      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
      Reductions[Phi] = RD;
      return true;
    }
  }
}

// Returns true if instruction I can participate in a reduction chain of
// kind 'Kind'.
bool
LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                            ReductionKind Kind) {
  switch (I->getOpcode()) {
  default:
    return false;
  case Instruction::PHI:
    // possibly.
    return true;
  case Instruction::Add:
  case Instruction::Sub:
    return Kind == IntegerAdd;
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
    return Kind == IntegerMult;
  case Instruction::And:
    return Kind == IntegerAnd;
  case Instruction::Or:
    return Kind == IntegerOr;
  case Instruction::Xor:
    return Kind == IntegerXor;
  }
}

bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  // Check that the PHI is consecutive and starts at zero.
  const SCEV *PhiScev = SE->getSCEV(Phi);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
    return false;
  }
  const SCEV *Step = AR->getStepRecurrence(*SE);

  // Only unit-stride inductions are supported.
  if (!Step->isOne()) {
    DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
    return false;
  }
  return true;
}

// Returns true if Ptr is an affine SCEV add-recurrence, i.e. its bounds can
// be computed for a runtime memory check.
bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
  const SCEV *PhiScev = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR)
    return false;

  return AR->isAffine();
}

// Picks the most profitable vectorization factor in {1, 2, 4, ..., VF} by
// comparing the per-lane estimated cost of each width against the scalar
// loop. Returns 1 (no vectorization) when no target info is available.
unsigned
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
  if (!VTTI) {
    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
    return 1;
  }

  float Cost = expectedCost(1);
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
  for (unsigned i=2; i <= VF; i*=2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    float VectorCost = expectedCost(i) / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
  return Width;
}

// Sums the estimated cost of every instruction in the loop for a given
// vectorization factor VF.
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  // We can only estimate the cost of single basic block loops.
  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");

  BasicBlock *BB = TheLoop->getHeader();
  unsigned Cost = 0;

  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    Instruction *Inst = it;
    unsigned C = getInstructionCost(Inst, VF);
    Cost += C;
    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
          " For instruction: "<< *Inst << "\n");
  }

  return Cost;
}

// Estimates the cost of a single instruction when vectorized with factor VF,
// using the target transform info (VTTI).
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  assert(VTTI && "Invalid vector target transformation info");

  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  Type *RetTy = I->getType();
  Type *VectorTy = ToVectorTy(RetTy, VF);


  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because scalar GEPs are usually
    // lowered to the instruction addressing mode. At the moment we don't
    // generate vector geps.
    return 0;
  case Instruction::Br: {
    return VTTI->getCFInstrCost(I->getOpcode());
  }
  case Instruction::PHI:
    return 0;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    VectorTy = ToVectorTy(ValTy, VF);
    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(I);
    Type *ValTy = SI->getValueOperand()->getType();
    VectorTy = ToVectorTy(ValTy, VF);

    if (VF == 1)
      return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
                                   SI->getAlignment(),
                                   SI->getPointerAddressSpace());

    // Scalarized stores.
    if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
      unsigned Cost = 0;
      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                            ValTy);
      // The cost of extracting from the value vector.
      Cost += VF * (ExtCost);
      // The cost of the scalar stores.
      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                         ValTy->getScalarType(),
                                         SI->getAlignment(),
                                         SI->getPointerAddressSpace());
      return Cost;
    }

    // Wide stores.
    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
                                 SI->getPointerAddressSpace());
  }
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(I);

    if (VF == 1)
      return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
                                   LI->getAlignment(),
                                   LI->getPointerAddressSpace());

    // Scalarized loads.
    if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
      unsigned Cost = 0;
      unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
      // The cost of inserting the loaded value into the result vector.
      Cost += VF * (InCost);
      // The cost of the scalar loads.
      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                         RetTy->getScalarType(),
                                         LI->getAlignment(),
                                         LI->getPointerAddressSpace());
      return Cost;
    }

    // Wide loads.
    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
                                 LI->getPointerAddressSpace());
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
    return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  }
  default: {
    // We are scalarizing the instruction. Return the cost of the scalar
    // instruction, plus the cost of insert and extract into vector
    // elements, times the vector width.
1897 unsigned Cost = 0; 1898 1899 bool IsVoid = RetTy->isVoidTy(); 1900 1901 unsigned InsCost = (IsVoid ? 0 : 1902 VTTI->getInstrCost(Instruction::InsertElement, 1903 VectorTy)); 1904 1905 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 1906 VectorTy); 1907 1908 // The cost of inserting the results plus extracting each one of the 1909 // operands. 1910 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 1911 1912 // The cost of executing VF copies of the scalar instruction. 1913 Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); 1914 return Cost; 1915 } 1916 }// end of switch. 1917} 1918 1919Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 1920 if (Scalar->isVoidTy() || VF == 1) 1921 return Scalar; 1922 return VectorType::get(Scalar, VF); 1923} 1924 1925} // namespace 1926 1927char LoopVectorize::ID = 0; 1928static const char lv_name[] = "Loop Vectorization"; 1929INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 1930INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 1931INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 1932INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 1933INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 1934 1935namespace llvm { 1936 Pass *createLoopVectorizePass() { 1937 return new LoopVectorize(); 1938 } 1939} 1940 1941