LoopVectorize.cpp revision 462d1ca42831df53a3c6435d247776f0b85bd594
//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR. Legalization of the IR is done
// in the codegen. However, the vectorizer uses (will use) the codegen
// interfaces to generate IR that is likely to result in an optimal binary.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. SingleBlockLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
38// 39//===----------------------------------------------------------------------===// 40#define LV_NAME "loop-vectorize" 41#define DEBUG_TYPE LV_NAME 42#include "llvm/Constants.h" 43#include "llvm/DerivedTypes.h" 44#include "llvm/Instructions.h" 45#include "llvm/LLVMContext.h" 46#include "llvm/Pass.h" 47#include "llvm/Analysis/LoopPass.h" 48#include "llvm/Value.h" 49#include "llvm/Function.h" 50#include "llvm/Analysis/Verifier.h" 51#include "llvm/Module.h" 52#include "llvm/Type.h" 53#include "llvm/ADT/SmallVector.h" 54#include "llvm/ADT/StringExtras.h" 55#include "llvm/Analysis/AliasAnalysis.h" 56#include "llvm/Analysis/AliasSetTracker.h" 57#include "llvm/Analysis/ScalarEvolution.h" 58#include "llvm/Analysis/Dominators.h" 59#include "llvm/Analysis/ScalarEvolutionExpressions.h" 60#include "llvm/Analysis/ScalarEvolutionExpander.h" 61#include "llvm/Analysis/LoopInfo.h" 62#include "llvm/Analysis/ValueTracking.h" 63#include "llvm/Transforms/Scalar.h" 64#include "llvm/Transforms/Utils/BasicBlockUtils.h" 65#include "llvm/TargetTransformInfo.h" 66#include "llvm/Support/CommandLine.h" 67#include "llvm/Support/Debug.h" 68#include "llvm/Support/raw_ostream.h" 69#include "llvm/DataLayout.h" 70#include "llvm/Transforms/Utils/Local.h" 71#include <algorithm> 72using namespace llvm; 73 74static cl::opt<unsigned> 75VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, 76 cl::desc("Set the default vectorization width. Zero is autoselect.")); 77 78namespace { 79 80// Forward declarations. 81class LoopVectorizationLegality; 82class LoopVectorizationCostModel; 83 84/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic 85/// block to a specified vectorization factor (VF). 86/// This class performs the widening of scalars into vectors, or multiple 87/// scalars. 
This class also implements the following features: 88/// * It inserts an epilogue loop for handling loops that don't have iteration 89/// counts that are known to be a multiple of the vectorization factor. 90/// * It handles the code generation for reduction variables. 91/// * Scalarization (implementation using scalars) of un-vectorizable 92/// instructions. 93/// SingleBlockLoopVectorizer does not perform any vectorization-legality 94/// checks, and relies on the caller to check for the different legality 95/// aspects. The SingleBlockLoopVectorizer relies on the 96/// LoopVectorizationLegality class to provide information about the induction 97/// and reduction variables that were found to a given vectorization factor. 98class SingleBlockLoopVectorizer { 99public: 100 /// Ctor. 101 SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, 102 DominatorTree *dt, LPPassManager *Lpm, 103 unsigned VecWidth): 104 OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth), 105 Builder(Se->getContext()), Induction(0), OldInduction(0) { } 106 107 // Perform the actual loop widening (vectorization). 108 void vectorize(LoopVectorizationLegality *Legal) { 109 ///Create a new empty loop. Unlink the old loop and connect the new one. 110 createEmptyLoop(Legal); 111 /// Widen each instruction in the old loop to a new one in the new loop. 112 /// Use the Legality module to find the induction and reduction variables. 113 vectorizeLoop(Legal); 114 // register the new loop. 115 updateAnalysis(); 116 } 117 118private: 119 /// Create an empty loop, based on the loop ranges of the old loop. 120 void createEmptyLoop(LoopVectorizationLegality *Legal); 121 /// Copy and widen the instructions from the old loop. 122 void vectorizeLoop(LoopVectorizationLegality *Legal); 123 /// Insert the new loop to the loop hierarchy and pass manager. 124 void updateAnalysis(); 125 126 /// This instruction is un-vectorizable. Implement it as a sequence 127 /// of scalars. 
128 void scalarizeInstruction(Instruction *Instr); 129 130 /// Create a broadcast instruction. This method generates a broadcast 131 /// instruction (shuffle) for loop invariant values and for the induction 132 /// value. If this is the induction variable then we extend it to N, N+1, ... 133 /// this is needed because each iteration in the loop corresponds to a SIMD 134 /// element. 135 Value *getBroadcastInstrs(Value *V); 136 137 /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. 138 /// for each element in the vector. Starting from zero. 139 Value *getConsecutiveVector(Value* Val); 140 141 /// When we go over instructions in the basic block we rely on previous 142 /// values within the current basic block or on loop invariant values. 143 /// When we widen (vectorize) values we place them in the map. If the values 144 /// are not within the map, they have to be loop invariant, so we simply 145 /// broadcast them into a vector. 146 Value *getVectorValue(Value *V); 147 148 /// Get a uniform vector of constant integers. We use this to get 149 /// vectors of ones and zeros for the reduction code. 150 Constant* getUniformVector(unsigned Val, Type* ScalarTy); 151 152 typedef DenseMap<Value*, Value*> ValueMap; 153 154 /// The original loop. 155 Loop *OrigLoop; 156 // Scev analysis to use. 157 ScalarEvolution *SE; 158 // Loop Info. 159 LoopInfo *LI; 160 // Dominator Tree. 161 DominatorTree *DT; 162 // Loop Pass Manager; 163 LPPassManager *LPM; 164 // The vectorization factor to use. 165 unsigned VF; 166 167 // The builder that we use 168 IRBuilder<> Builder; 169 170 // --- Vectorization state --- 171 172 /// The vector-loop preheader. 173 BasicBlock *LoopVectorPreHeader; 174 /// The scalar-loop preheader. 175 BasicBlock *LoopScalarPreHeader; 176 /// Middle Block between the vector and the scalar. 177 BasicBlock *LoopMiddleBlock; 178 ///The ExitBlock of the scalar loop. 179 BasicBlock *LoopExitBlock; 180 ///The vector loop body. 
181 BasicBlock *LoopVectorBody; 182 ///The scalar loop body. 183 BasicBlock *LoopScalarBody; 184 ///The first bypass block. 185 BasicBlock *LoopBypassBlock; 186 187 /// The new Induction variable which was added to the new block. 188 PHINode *Induction; 189 /// The induction variable of the old basic block. 190 PHINode *OldInduction; 191 // Maps scalars to widened vectors. 192 ValueMap WidenMap; 193}; 194 195/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and 196/// to what vectorization factor. 197/// This class does not look at the profitability of vectorization, only the 198/// legality. This class has two main kinds of checks: 199/// * Memory checks - The code in canVectorizeMemory checks if vectorization 200/// will change the order of memory accesses in a way that will change the 201/// correctness of the program. 202/// * Scalars checks - The code in canVectorizeBlock checks for a number 203/// of different conditions, such as the availability of a single induction 204/// variable, that all types are supported and vectorize-able, etc. 205/// This code reflects the capabilities of SingleBlockLoopVectorizer. 206/// This class is also used by SingleBlockLoopVectorizer for identifying 207/// induction variable and the different reduction variables. 208class LoopVectorizationLegality { 209public: 210 LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): 211 TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } 212 213 /// This represents the kinds of reductions that we support. 214 enum ReductionKind { 215 NoReduction = -1, /// Not a reduction. 216 IntegerAdd = 0, /// Sum of numbers. 217 IntegerMult = 1, /// Product of numbers. 218 IntegerOr = 2, /// Bitwise or logical OR of numbers. 219 IntegerAnd = 3, /// Bitwise or logical AND of numbers. 220 IntegerXor = 4 /// Bitwise or logical XOR of numbers. 221 }; 222 223 /// This POD struct holds information about reduction variables. 
224 struct ReductionDescriptor { 225 // Default C'tor 226 ReductionDescriptor(): 227 StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} 228 229 // C'tor. 230 ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): 231 StartValue(Start), LoopExitInstr(Exit), Kind(K) {} 232 233 // The starting value of the reduction. 234 // It does not have to be zero! 235 Value *StartValue; 236 // The instruction who's value is used outside the loop. 237 Instruction *LoopExitInstr; 238 // The kind of the reduction. 239 ReductionKind Kind; 240 }; 241 242 /// ReductionList contains the reduction descriptors for all 243 /// of the reductions that were found in the loop. 244 typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; 245 246 /// Returns true if it is legal to vectorize this loop. 247 /// This does not mean that it is profitable to vectorize this 248 /// loop, only that it is legal to do so. 249 bool canVectorize(); 250 251 /// Returns the Induction variable. 252 PHINode *getInduction() {return Induction;} 253 254 /// Returns the reduction variables found in the loop. 255 ReductionList *getReductionVars() { return &Reductions; } 256 257 /// Check if the pointer returned by this GEP is consecutive 258 /// when the index is vectorized. This happens when the last 259 /// index of the GEP is consecutive, like the induction variable. 260 /// This check allows us to vectorize A[idx] into a wide load/store. 261 bool isConsecutiveGep(Value *Ptr); 262 263 /// Returns true if this instruction will remain scalar after vectorization. 264 bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} 265 266private: 267 /// Check if a single basic block loop is vectorizable. 268 /// At this point we know that this is a loop with a constant trip count 269 /// and we only need to check individual instructions. 
270 bool canVectorizeBlock(BasicBlock &BB); 271 272 /// When we vectorize loops we may change the order in which 273 /// we read and write from memory. This method checks if it is 274 /// legal to vectorize the code, considering only memory constrains. 275 /// Returns true if BB is vectorizable 276 bool canVectorizeMemory(BasicBlock &BB); 277 278 /// Returns True, if 'Phi' is the kind of reduction variable for type 279 /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. 280 bool AddReductionVar(PHINode *Phi, ReductionKind Kind); 281 /// Returns true if the instruction I can be a reduction variable of type 282 /// 'Kind'. 283 bool isReductionInstr(Instruction *I, ReductionKind Kind); 284 /// Returns True, if 'Phi' is an induction variable. 285 bool isInductionVariable(PHINode *Phi); 286 287 /// The loop that we evaluate. 288 Loop *TheLoop; 289 /// Scev analysis. 290 ScalarEvolution *SE; 291 /// DataLayout analysis. 292 DataLayout *DL; 293 294 // --- vectorization state --- // 295 296 /// Holds the induction variable. 297 PHINode *Induction; 298 /// Holds the reduction variables. 299 ReductionList Reductions; 300 /// Allowed outside users. This holds the reduction 301 /// vars which can be accessed from outside the loop. 302 SmallPtrSet<Value*, 4> AllowedExit; 303 /// This set holds the variables which are known to be uniform after 304 /// vectorization. 305 SmallPtrSet<Instruction*, 4> Uniforms; 306}; 307 308/// LoopVectorizationCostModel - estimates the expected speedups due to 309/// vectorization. 310/// In many cases vectorization is not profitable. This can happen because 311/// of a number of reasons. In this class we mainly attempt to predict 312/// the expected speedup/slowdowns due to the supported instruction set. 313/// We use the VectorTargetTransformInfo to query the different backends 314/// for the cost of different operations. 315class LoopVectorizationCostModel { 316public: 317 /// C'tor. 
318 LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, 319 LoopVectorizationLegality *Leg, 320 const VectorTargetTransformInfo *Vtti): 321 TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } 322 323 /// Returns the most profitable vectorization factor for the loop that is 324 /// smaller or equal to the VF argument. This method checks every power 325 /// of two up to VF. 326 unsigned findBestVectorizationFactor(unsigned VF = 8); 327 328private: 329 /// Returns the expected execution cost. The unit of the cost does 330 /// not matter because we use the 'cost' units to compare different 331 /// vector widths. The cost that is returned is *not* normalized by 332 /// the factor width. 333 unsigned expectedCost(unsigned VF); 334 335 /// Returns the execution time cost of an instruction for a given vector 336 /// width. Vector width of one means scalar. 337 unsigned getInstructionCost(Instruction *I, unsigned VF); 338 339 /// A helper function for converting Scalar types to vector types. 340 /// If the incoming type is void, we return void. If the VF is 1, we return 341 /// the scalar type. 342 static Type* ToVectorTy(Type *Scalar, unsigned VF); 343 344 /// The loop that we evaluate. 345 Loop *TheLoop; 346 /// Scev analysis. 347 ScalarEvolution *SE; 348 349 /// Vectorization legality. 350 LoopVectorizationLegality *Legal; 351 /// Vector target information. 352 const VectorTargetTransformInfo *VTTI; 353}; 354 355struct LoopVectorize : public LoopPass { 356 static char ID; // Pass identification, replacement for typeid 357 358 LoopVectorize() : LoopPass(ID) { 359 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 360 } 361 362 ScalarEvolution *SE; 363 DataLayout *DL; 364 LoopInfo *LI; 365 TargetTransformInfo *TTI; 366 DominatorTree *DT; 367 368 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { 369 // We only vectorize innermost loops. 
370 if (!L->empty()) 371 return false; 372 373 SE = &getAnalysis<ScalarEvolution>(); 374 DL = getAnalysisIfAvailable<DataLayout>(); 375 LI = &getAnalysis<LoopInfo>(); 376 TTI = getAnalysisIfAvailable<TargetTransformInfo>(); 377 DT = &getAnalysis<DominatorTree>(); 378 379 DEBUG(dbgs() << "LV: Checking a loop in \"" << 380 L->getHeader()->getParent()->getName() << "\"\n"); 381 382 // Check if it is legal to vectorize the loop. 383 LoopVectorizationLegality LVL(L, SE, DL); 384 if (!LVL.canVectorize()) { 385 DEBUG(dbgs() << "LV: Not vectorizing.\n"); 386 return false; 387 } 388 389 // Select the preffered vectorization factor. 390 unsigned VF = 1; 391 if (VectorizationFactor == 0) { 392 const VectorTargetTransformInfo *VTTI = 0; 393 if (TTI) 394 VTTI = TTI->getVectorTargetTransformInfo(); 395 // Use the cost model. 396 LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); 397 VF = CM.findBestVectorizationFactor(); 398 399 if (VF == 1) { 400 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 401 return false; 402 } 403 404 } else { 405 // Use the user command flag. 406 VF = VectorizationFactor; 407 } 408 409 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< 410 L->getHeader()->getParent()->getParent()->getModuleIdentifier()<< 411 "\n"); 412 413 // If we decided that it is *legal* to vectorizer the loop then do it. 
414 SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF); 415 LB.vectorize(&LVL); 416 417 DEBUG(verifyFunction(*L->getHeader()->getParent())); 418 return true; 419 } 420 421 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 422 LoopPass::getAnalysisUsage(AU); 423 AU.addRequiredID(LoopSimplifyID); 424 AU.addRequiredID(LCSSAID); 425 AU.addRequired<LoopInfo>(); 426 AU.addRequired<ScalarEvolution>(); 427 AU.addRequired<DominatorTree>(); 428 AU.addPreserved<LoopInfo>(); 429 AU.addPreserved<DominatorTree>(); 430 } 431 432}; 433 434Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { 435 // Instructions that access the old induction variable 436 // actually want to get the new one. 437 if (V == OldInduction) 438 V = Induction; 439 // Create the types. 440 LLVMContext &C = V->getContext(); 441 Type *VTy = VectorType::get(V->getType(), VF); 442 Type *I32 = IntegerType::getInt32Ty(C); 443 Constant *Zero = ConstantInt::get(I32, 0); 444 Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); 445 Value *UndefVal = UndefValue::get(VTy); 446 // Insert the value into a new vector. 447 Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); 448 // Broadcast the scalar into all locations in the vector. 449 Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, 450 "broadcast"); 451 // We are accessing the induction variable. Make sure to promote the 452 // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. 453 if (V == Induction) 454 return getConsecutiveVector(Shuf); 455 return Shuf; 456} 457 458Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { 459 assert(Val->getType()->isVectorTy() && "Must be a vector"); 460 assert(Val->getType()->getScalarType()->isIntegerTy() && 461 "Elem must be an integer"); 462 // Create the types. 
463 Type *ITy = Val->getType()->getScalarType(); 464 VectorType *Ty = cast<VectorType>(Val->getType()); 465 unsigned VLen = Ty->getNumElements(); 466 SmallVector<Constant*, 8> Indices; 467 468 // Create a vector of consecutive numbers from zero to VF. 469 for (unsigned i = 0; i < VLen; ++i) 470 Indices.push_back(ConstantInt::get(ITy, i)); 471 472 // Add the consecutive indices to the vector value. 473 Constant *Cv = ConstantVector::get(Indices); 474 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 475 return Builder.CreateAdd(Val, Cv, "induction"); 476} 477 478bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { 479 GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 480 if (!Gep) 481 return false; 482 483 unsigned NumOperands = Gep->getNumOperands(); 484 Value *LastIndex = Gep->getOperand(NumOperands - 1); 485 486 // Check that all of the gep indices are uniform except for the last. 487 for (unsigned i = 0; i < NumOperands - 1; ++i) 488 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 489 return false; 490 491 // We can emit wide load/stores only of the last index is the induction 492 // variable. 493 const SCEV *Last = SE->getSCEV(LastIndex); 494 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 495 const SCEV *Step = AR->getStepRecurrence(*SE); 496 497 // The memory is consecutive because the last index is consecutive 498 // and all other indices are loop invariant. 499 if (Step->isOne()) 500 return true; 501 } 502 503 return false; 504} 505 506Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { 507 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 508 // If we saved a vectorized copy of V, use it. 509 Value *&MapEntry = WidenMap[V]; 510 if (MapEntry) 511 return MapEntry; 512 513 // Broadcast V and save the value for future uses. 
514 Value *B = getBroadcastInstrs(V); 515 MapEntry = B; 516 return B; 517} 518 519Constant* 520SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { 521 SmallVector<Constant*, 8> Indices; 522 // Create a vector of consecutive numbers from zero to VF. 523 for (unsigned i = 0; i < VF; ++i) 524 Indices.push_back(ConstantInt::get(ScalarTy, Val, true)); 525 526 // Add the consecutive indices to the vector value. 527 return ConstantVector::get(Indices); 528} 529 530void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 531 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 532 // Holds vector parameters or scalars, in case of uniform vals. 533 SmallVector<Value*, 8> Params; 534 535 // Find all of the vectorized parameters. 536 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 537 Value *SrcOp = Instr->getOperand(op); 538 539 // If we are accessing the old induction variable, use the new one. 540 if (SrcOp == OldInduction) { 541 Params.push_back(getBroadcastInstrs(Induction)); 542 continue; 543 } 544 545 // Try using previously calculated values. 546 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 547 548 // If the src is an instruction that appeared earlier in the basic block 549 // then it should already be vectorized. 550 if (SrcInst && SrcInst->getParent() == Instr->getParent()) { 551 assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); 552 // The parameter is a vector value from earlier. 553 Params.push_back(WidenMap[SrcInst]); 554 } else { 555 // The parameter is a scalar from outside the loop. Maybe even a constant. 556 Params.push_back(SrcOp); 557 } 558 } 559 560 assert(Params.size() == Instr->getNumOperands() && 561 "Invalid number of operands"); 562 563 // Does this instruction return a value ? 564 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 565 Value *VecResults = 0; 566 567 // If we have a return value, create an empty vector. 
We place the scalarized 568 // instructions in this vector. 569 if (!IsVoidRetTy) 570 VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); 571 572 // For each scalar that we create: 573 for (unsigned i = 0; i < VF; ++i) { 574 Instruction *Cloned = Instr->clone(); 575 if (!IsVoidRetTy) 576 Cloned->setName(Instr->getName() + ".cloned"); 577 // Replace the operands of the cloned instrucions with extracted scalars. 578 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 579 Value *Op = Params[op]; 580 // Param is a vector. Need to extract the right lane. 581 if (Op->getType()->isVectorTy()) 582 Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); 583 Cloned->setOperand(op, Op); 584 } 585 586 // Place the cloned scalar in the new loop. 587 Builder.Insert(Cloned); 588 589 // If the original scalar returns a value we need to place it in a vector 590 // so that future users will be able to use it. 591 if (!IsVoidRetTy) 592 VecResults = Builder.CreateInsertElement(VecResults, Cloned, 593 Builder.getInt32(i)); 594 } 595 596 if (!IsVoidRetTy) 597 WidenMap[Instr] = VecResults; 598} 599 600void 601SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 602 /* 603 In this function we generate a new loop. The new loop will contain 604 the vectorized instructions while the old loop will continue to run the 605 scalar remainder. 606 607 [ ] <-- vector loop bypass. 608 / | 609 / v 610| [ ] <-- vector pre header. 611| | 612| v 613| [ ] \ 614| [ ]_| <-- vector loop. 615| | 616 \ v 617 >[ ] <--- middle-block. 618 / | 619 / v 620| [ ] <--- new preheader. 621| | 622| v 623| [ ] \ 624| [ ]_| <-- old scalar loop to handle remainder. 625 \ | 626 \ v 627 >[ ] <-- exit block. 628 ... 629 */ 630 631 // This is the original scalar-loop preheader. 
632 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 633 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 634 assert(ExitBlock && "Must have an exit block"); 635 636 // The loop index does not have to start at Zero. It starts with this value. 637 OldInduction = Legal->getInduction(); 638 Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock); 639 640 assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); 641 assert(BypassBlock && "Invalid loop structure"); 642 643 BasicBlock *VectorPH = 644 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 645 BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), 646 "vector.body"); 647 648 BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), 649 "middle.block"); 650 BasicBlock *ScalarPH = 651 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), 652 "scalar.preheader"); 653 // Find the induction variable. 654 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 655 assert(OldInduction && "We must have a single phi node."); 656 Type *IdxTy = OldInduction->getType(); 657 658 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 659 // inside the loop. 660 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 661 662 // Generate the induction variable. 663 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 664 Constant *Step = ConstantInt::get(IdxTy, VF); 665 666 // Find the loop boundaries. 667 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); 668 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 669 670 // Get the total trip count from the count by adding 1. 671 ExitCount = SE->getAddExpr(ExitCount, 672 SE->getConstant(ExitCount->getType(), 1)); 673 674 // Expand the trip count and place the new instructions in the preheader. 675 // Notice that the pre-header does not change, only the loop body. 
676 SCEVExpander Exp(*SE, "induction"); 677 Instruction *Loc = BypassBlock->getTerminator(); 678 679 // We may need to extend the index in case there is a type mismatch. 680 // We know that the count starts at zero and does not overflow. 681 // We are using Zext because it should be less expensive. 682 if (ExitCount->getType() != Induction->getType()) 683 ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); 684 685 // Count holds the overall loop count (N). 686 Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); 687 688 // Add the start index to the loop count to get the new end index. 689 Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); 690 691 // Now we need to generate the expression for N - (N % VF), which is 692 // the part that the vectorized body will execute. 693 Constant *CIVF = ConstantInt::get(IdxTy, VF); 694 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); 695 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 696 Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, 697 "end.idx.rnd.down", Loc); 698 699 // Now, compare the new count to zero. If it is zero, jump to the scalar part. 700 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 701 IdxEndRoundDown, 702 StartIdx, 703 "cmp.zero", Loc); 704 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 705 // Remove the old terminator. 706 Loc->eraseFromParent(); 707 708 // Add a check in the middle block to see if we have completed 709 // all of the iterations in the first vector loop. 710 // If (N - N%VF) == N, then we *don't* need to run the remainder. 711 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 712 IdxEndRoundDown, "cmp.n", 713 MiddleBlock->getTerminator()); 714 715 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 716 // Remove the old terminator. 
717 MiddleBlock->getTerminator()->eraseFromParent(); 718 719 // Create i+1 and fill the PHINode. 720 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 721 Induction->addIncoming(StartIdx, VectorPH); 722 Induction->addIncoming(NextIdx, VecBody); 723 // Create the compare. 724 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 725 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 726 727 // Now we have two terminators. Remove the old one from the block. 728 VecBody->getTerminator()->eraseFromParent(); 729 730 // Fix the scalar body iteration count. 731 unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); 732 OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown); 733 734 // Get ready to start creating new instructions into the vectorized body. 735 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 736 737 // Register the new loop. 738 Loop* Lp = new Loop(); 739 LPM->insertLoop(Lp, OrigLoop->getParentLoop()); 740 741 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 742 743 Loop *ParentLoop = OrigLoop->getParentLoop(); 744 if (ParentLoop) { 745 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 746 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 747 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 748 } 749 750 // Save the state. 751 LoopVectorPreHeader = VectorPH; 752 LoopScalarPreHeader = ScalarPH; 753 LoopMiddleBlock = MiddleBlock; 754 LoopExitBlock = ExitBlock; 755 LoopVectorBody = VecBody; 756 LoopScalarBody = OldBasicBlock; 757 LoopBypassBlock = BypassBlock; 758} 759 760/// This function returns the identity element (or neutral element) for 761/// the operation K. 762static unsigned 763getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { 764 switch (K) { 765 case LoopVectorizationLegality::IntegerXor: 766 case LoopVectorizationLegality::IntegerAdd: 767 case LoopVectorizationLegality::IntegerOr: 768 // Adding, Xoring, Oring zero to a number does not change it. 
    return 0;
  case LoopVectorizationLegality::IntegerMult:
    // Multiplying a number by 1 does not change it.
    return 1;
  case LoopVectorizationLegality::IntegerAnd:
    // AND-ing a number with an all-1 value does not change it.
    return -1;
  default:
    llvm_unreachable("Unknown reduction kind");
  }
}

/// Widen every instruction in the (single-block) loop header to its VF-wide
/// vector form, recording the widened values in WidenMap, and then patch up
/// the reduction PHI nodes that could not be completed during the first pass.
void
SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should be also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//
  typedef SmallVector<PHINode*, 4> PhiVector;
  BasicBlock &BB = *OrigLoop->getHeader();
  // i32 0 - used below as the extract-element index for "lane zero".
  Constant *Zero = ConstantInt::get(
  IntegerType::getInt32Ty(BB.getContext()), 0);

  // In order to support reduction variables we need to be able to vectorize
  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
  // stages. First, we create a new vector PHI node with no incoming edges.
  // We use this value when we vectorize all of the instructions that use the
  // PHI. Next, after all of the instructions in the block are complete we
  // add the new incoming edges to the PHI. At this point all of the
  // instructions in the basic block are vectorized, so we can use them to
  // construct the PHI.
  PhiVector PHIsToFix;

  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *Inst = it;

    switch (Inst->getOpcode()) {
      case Instruction::Br:
        // Nothing to do for PHIs and BR, since we already took care of the
        // loop control flow instructions.
        continue;
      case Instruction::PHI:{
        PHINode* P = cast<PHINode>(Inst);
        // Special handling for the induction var.
        if (OldInduction == Inst)
          continue;
        // This is phase one of vectorizing PHIs.
        // This has to be a reduction variable.
        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
        Type *VecTy = VectorType::get(Inst->getType(), VF);
        // Empty vector PHI; incoming edges are added in the fix-up loop below.
        WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
        PHIsToFix.push_back(P);
        continue;
      }
      case Instruction::Add:
      case Instruction::FAdd:
      case Instruction::Sub:
      case Instruction::FSub:
      case Instruction::Mul:
      case Instruction::FMul:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::FDiv:
      case Instruction::URem:
      case Instruction::SRem:
      case Instruction::FRem:
      case Instruction::Shl:
      case Instruction::LShr:
      case Instruction::AShr:
      case Instruction::And:
      case Instruction::Or:
      case Instruction::Xor: {
        // Just widen binops.
        // NOTE(review): the opcode switch guarantees this is a
        // BinaryOperator, yet BinOp is dereferenced below without a null
        // check; cast<> (which asserts) would state the invariant better
        // than dyn_cast here.
        BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
        Value *A = getVectorValue(Inst->getOperand(0));
        Value *B = getVectorValue(Inst->getOperand(1));
        // Use this vector value for all users of the original instruction.
        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
        break;
      }
      case Instruction::Select: {
        // Widen selects.
        // If the selector is loop invariant we can create a select
        // instruction with a scalar condition. Otherwise, use vector-select.
        Value *Cond = Inst->getOperand(0);
        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);

        // The condition can be loop invariant but still defined inside the
        // loop. This means that we can't just use the original 'cond' value.
        // We have to take the 'vectorized' value and pick the first lane.
        // Instcombine will make this a no-op.
        Cond = getVectorValue(Cond);
        if (InvariantCond)
          Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));

        Value *Op0 = getVectorValue(Inst->getOperand(1));
        Value *Op1 = getVectorValue(Inst->getOperand(2));
        WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
        break;
      }

      case Instruction::ICmp:
      case Instruction::FCmp: {
        // Widen compares. Generate vector compares.
        bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
        // NOTE(review): same as BinOp above - cast<CmpInst> would be the
        // idiomatic choice since Cmp is used unconditionally.
        CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
        Value *A = getVectorValue(Inst->getOperand(0));
        Value *B = getVectorValue(Inst->getOperand(1));
        if (FCmp)
          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
        else
          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
        break;
      }

      case Instruction::Store: {
        // Attempt to issue a wide store.
        // NOTE(review): cast<StoreInst> would be preferable (SI is
        // dereferenced without a null check).
        StoreInst *SI = dyn_cast<StoreInst>(Inst);
        Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
        Value *Ptr = SI->getPointerOperand();
        unsigned Alignment = SI->getAlignment();
        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
        // This store does not use GEPs.
        if (!Legal->isConsecutiveGep(Gep)) {
          scalarizeInstruction(Inst);
          break;
        }

        // The last index does not have to be the induction. It can be
        // consecutive and be a function of the index. For example A[I+1];
        unsigned NumOperands = Gep->getNumOperands();
        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
        // Take lane 0 of the widened index: the scalar first-iteration index.
        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

        // Create the new GEP with the new induction variable.
        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
        Gep2->setOperand(NumOperands - 1, LastIndex);
        Ptr = Builder.Insert(Gep2);
        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
        Value *Val = getVectorValue(SI->getValueOperand());
        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
        break;
      }
      case Instruction::Load: {
        // Attempt to issue a wide load.
        // NOTE(review): cast<LoadInst> would be preferable here as well.
        LoadInst *LI = dyn_cast<LoadInst>(Inst);
        Type *RetTy = VectorType::get(LI->getType(), VF);
        Value *Ptr = LI->getPointerOperand();
        unsigned Alignment = LI->getAlignment();
        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);

        // We don't have a gep. Scalarize the load.
        if (!Legal->isConsecutiveGep(Gep)) {
          scalarizeInstruction(Inst);
          break;
        }

        // The last index does not have to be the induction. It can be
        // consecutive and be a function of the index. For example A[I+1];
        unsigned NumOperands = Gep->getNumOperands();
        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

        // Create the new GEP with the new induction variable.
        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
        Gep2->setOperand(NumOperands - 1, LastIndex);
        Ptr = Builder.Insert(Gep2);
        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
        LI = Builder.CreateLoad(Ptr);
        LI->setAlignment(Alignment);
        // Use this vector value for all users of the load.
        WidenMap[Inst] = LI;
        break;
      }
      case Instruction::ZExt:
      case Instruction::SExt:
      case Instruction::FPToUI:
      case Instruction::FPToSI:
      case Instruction::FPExt:
      case Instruction::PtrToInt:
      case Instruction::IntToPtr:
      case Instruction::SIToFP:
      case Instruction::UIToFP:
      case Instruction::Trunc:
      case Instruction::FPTrunc:
      case Instruction::BitCast: {
        /// Vectorize casts: same cast opcode, element type widened to VF.
        CastInst *CI = dyn_cast<CastInst>(Inst);
        Value *A = getVectorValue(Inst->getOperand(0));
        Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
        break;
      }

      default:
        /// All other instructions are unsupported. Scalarize them.
        scalarizeInstruction(Inst);
        break;
    }// end of switch.
  }// end of for_each instr.

  // At this point every instruction in the original loop is widened to
  // a vector form. We are almost done. Now, we need to fix the PHI nodes
  // that we vectorized. The PHI nodes are currently empty because we did
  // not want to introduce cycles. Notice that the remaining PHI nodes
  // that we need to fix are reduction variables.

  // Create the 'reduced' values for each of the induction vars.
  // The reduced values are the vector values that we scalarize and combine
  // after the loop is finished.
  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
       it != e; ++it) {
    PHINode *RdxPhi = *it;
    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
    // NOTE(review): this asserts RdxPhi, which is trivially non-null here;
    // the dyn_cast result actually being validated is VecRdxPhi - the
    // assert should read `assert(VecRdxPhi && ...)`.
    assert(RdxPhi && "Unable to recover vectorized PHI");

    // Find the reduction variable descriptor.
    assert(Legal->getReductionVars()->count(RdxPhi) &&
           "Unable to find the reduction variable");
    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
      (*Legal->getReductionVars())[RdxPhi];

    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
    Type *VecTy = VectorExit->getType();

    // Find the reduction identity variable. Zero for addition, or, xor,
    // one for multiplication, -1 for And.
    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
                                          VecTy->getScalarType());

    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                    RdxDesc.StartValue, Zero);


    // Fix the vector-loop phi.
    // We created the induction variable so we know that the
    // preheader is the first entry.
    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);

    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
    VecRdxPhi->addIncoming(Val, LoopVectorBody);

    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

    // This PHINode contains the vectorized reduction variable, or
    // the initial value vector, if we bypass the vector loop.
    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);

    // Extract the first scalar.
    Value *Scalar0 =
      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
    // Extract and reduce the remaining vector elements.
    for (unsigned i=1; i < VF; ++i) {
      Value *Scalar1 =
        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
      switch (RdxDesc.Kind) {
        case LoopVectorizationLegality::IntegerAdd:
          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
          break;
        case LoopVectorizationLegality::IntegerMult:
          Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
          break;
        case LoopVectorizationLegality::IntegerOr:
          Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
          break;
        case LoopVectorizationLegality::IntegerAnd:
          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
          break;
        case LoopVectorizationLegality::IntegerXor:
          Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
          break;
        default:
          llvm_unreachable("Unknown reduction operation");
      }
    }

    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
    // PHI nodes in the exit blocks.
    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
      if (!LCSSAPhi) continue;

      // All PHINodes need to have a single entry edge, or two if
      // we already fixed them.
      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

      // We found our reduction value exit-PHI. Update it with the
      // incoming bypass edge.
      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
        // Add an edge coming from the bypass.
        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
        break;
      }
    }// end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
  }// end of for each redux variable.
}

/// Update ScalarEvolution and the dominator tree to reflect the new CFG
/// (bypass, vector preheader/body, middle block, scalar preheader) created
/// around the original loop.
void SingleBlockLoopVectorizer::updateAnalysis() {
  // The original basic block.
  SE->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
         "Entry does not dominate exit.");

  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  DEBUG(DT->verifyAnalysis());
}

/// Top-level legality check: the loop must have a preheader, a single basic
/// block that passes canVectorizeBlock, and a SCEV-computable trip count.
bool LoopVectorizationLegality::canVectorize() {
  if (!TheLoop->getLoopPreheader()) {
    // NOTE(review): assert(false) aborts debug builds on any non-simplified
    // loop; the intent appears to be a graceful bail-out - the assert
    // should be dropped so the `return false` below takes effect.
    assert(false && "No preheader!!");
    DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
    return false;
  }

  // We can only vectorize single basic block loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1) {
    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
    return false;
  }

  // We need to have a loop header.
  BasicBlock *BB = TheLoop->getHeader();
  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");

  // Go over each instruction and look at memory deps.
  if (!canVectorizeBlock(*BB)) {
    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
    return false;
  }

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
  if (ExitCount == SE->getCouldNotCompute()) {
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");

  // Okay! We can vectorize. At this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return true;
}

/// Check a single loop body block for vectorization hazards: classify every
/// PHI as the (unique) induction variable or a supported integer reduction,
/// reject calls and non-vectorizable types, verify all values are used only
/// inside the block (except allowed reduction exits), check memory safety,
/// and finally collect the values that stay uniform after vectorization.
bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
  // Scan the instructions in the block and look for hazards.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    PHINode *Phi = dyn_cast<PHINode>(I);
    if (Phi) {
      // This should not happen because the loop should be normalized.
      if (Phi->getNumIncomingValues() != 2) {
        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
        return false;
      }
      // We only look at integer phi nodes.
      if (!Phi->getType()->isIntegerTy()) {
        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
        return false;
      }

      if (isInductionVariable(Phi)) {
        // Only a single induction variable is supported.
        if (Induction) {
          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
          return false;
        }
        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
        Induction = Phi;
        continue;
      }
      // Try each supported reduction kind in turn; the first match wins.
      if (AddReductionVar(Phi, IntegerAdd)) {
        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerMult)) {
        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerOr)) {
        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerAnd)) {
        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerXor)) {
        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
        continue;
      }

      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
      return false;
    }// end of PHI handling

    // We still don't handle functions.
    CallInst *CI = dyn_cast<CallInst>(I);
    if (CI) {
      DEBUG(dbgs() << "LV: Found a call site.\n");
      return false;
    }

    // We do not re-vectorize vectors.
    if (!VectorType::isValidElementType(I->getType()) &&
        !I->getType()->isVoidTy()) {
      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
      return false;
    }

    // Reduction instructions are allowed to have exit users.
    // All other instructions must not have external users.
    if (!AllowedExit.count(I))
      //Check that all of the users of the loop are inside the BB.
      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
           it != e; ++it) {
        Instruction *U = cast<Instruction>(*it);
        // This user may be a reduction exit value.
        BasicBlock *Parent = U->getParent();
        if (Parent != &BB) {
          DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
          return false;
        }
      }
  } // next instr.

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find an induction var.\n");
    return false;
  }

  // Don't vectorize if the memory dependencies do not allow vectorization.
  if (!canVectorizeMemory(BB))
    return false;

  // We now know that the loop is vectorizable!
  // Collect variables that will remain uniform after vectorization.
  std::vector<Value*> Worklist;

  // Start with the conditional branch and walk up the block.
  // Everything feeding the loop's exit condition (up to the induction PHI)
  // stays scalar/uniform in the vector loop.
  Worklist.push_back(BB.getTerminator()->getOperand(0));

  while (Worklist.size()) {
    Instruction *I = dyn_cast<Instruction>(Worklist.back());
    Worklist.pop_back();
    // Look at instructions inside this block.
    if (!I) continue;
    if (I->getParent() != &BB) continue;

    // Stop when reaching PHI nodes.
    if (isa<PHINode>(I)) {
      assert(I == Induction && "Found a uniform PHI that is not the induction");
      break;
    }

    // This is a known uniform.
    Uniforms.insert(I);

    // Insert all operands.
    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
      Worklist.push_back(I->getOperand(i));
    }
  }

  return true;
}

/// Memory legality: collect all loads and stores in the block and prove that
/// no write can conflict with another write or with any read (using
/// underlying-object identification rather than full dependence analysis).
bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;
  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;

  // Scan the BB and collect legal loads and stores.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    // If this is a load, save it. If this instruction can read from memory
    // but is not a load, then we quit. Notice that we don't handle function
    // calls that read or write.
    if (I->mayReadFromMemory()) {
      LoadInst *Ld = dyn_cast<LoadInst>(I);
      if (!Ld) return false;
      if (!Ld->isSimple()) {
        // Volatile/atomic loads cannot be reordered or widened.
        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
        return false;
      }
      Loads.push_back(Ld);
      continue;
    }

    // Save store instructions. Abort if other instructions write to memory.
    if (I->mayWriteToMemory()) {
      StoreInst *St = dyn_cast<StoreInst>(I);
      if (!St) return false;
      if (!St->isSimple()) {
        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
        return false;
      }
      Stores.push_back(St);
    }
  } // next instr.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }

  // Holds the read and read-write *pointers* that we find.
  ValueVector Reads;
  ValueVector ReadWrites;

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = dyn_cast<StoreInst>(*I);
    assert(ST && "Bad StoreInst");
    Value* Ptr = ST->getPointerOperand();
    // If we did *not* see this pointer before, insert it to
    // the read-write list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr))
      ReadWrites.push_back(Ptr);
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = dyn_cast<LoadInst>(*I);
    assert(LD && "Bad LoadInst");
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
      Reads.push_back(Ptr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then is it safe to vectorize.
  if (ReadWrites.size() == 1 && Reads.size() == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  // Now that the pointers are in two lists (Reads and ReadWrites), we
  // can check that there are no conflicts between each of the writes and
  // between the writes to the reads.
  ValueSet WriteObjects;
  ValueVector TempObjects;

  // Check that the read-writes do not conflict with other read-write
  // pointers.
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
        return false;
      }
      if (!WriteObjects.insert(*it)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
              << **it <<"\n");
        return false;
      }
    }
    TempObjects.clear();
  }

  /// Check that the reads don't conflict with the read-writes.
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
        return false;
      }
      if (WriteObjects.count(*it)) {
        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
              << **it <<"\n");
        return false;
      }
    }
    TempObjects.clear();
  }

  // All is okay.
  return true;
}

/// Try to recognize Phi as the header PHI of a reduction of the given Kind.
/// On success, records the reduction descriptor in Reductions, whitelists the
/// single out-of-block exit instruction in AllowedExit, and returns true.
bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Find the possible incoming reduction variable.
  BasicBlock *BB = Phi->getParent();
  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction, variables (which form a cycle
  // which ends in the phi node).
  Instruction *ExitInstruction = 0;

  // Iter is our iterator. We start with the PHI node and scan for all of the
  // users of this instruction. All users must be instructions which can be
  // used as reduction variables (such as ADD). We may have a single
  // out-of-block user. The cycle must end with the original PHI.
  // Also, we can't have multiple block-local users.
  //
  // NOTE(review): this is a `while (true)` with no iteration bound. If the
  // use chain cycles back to the PHI but never produces an out-of-block
  // user, FoundStartPHI is set while ExitInstruction stays null, Iter keeps
  // walking the same cycle, and the loop never terminates. A guard such as
  // `if (FoundStartPHI) return false;` (when no exit was found) would make
  // the failure path explicit.
  Instruction *Iter = Phi;
  while (true) {
    // Any reduction instr must be of one of the allowed kinds.
    if (!isReductionInstr(Iter, Kind))
      return false;

    // Did we find a user inside this block ?
    bool FoundInBlockUser = false;
    // Did we reach the initial PHI node ?
    bool FoundStartPHI = false;

    // If the instruction has no users then this is a broken
    // chain and can't be a reduction variable.
    if (Iter->use_empty())
      return false;

    // For each of the *users* of iter.
    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
         it != e; ++it) {
      Instruction *U = cast<Instruction>(*it);
      // We already know that the PHI is a user.
      if (U == Phi) {
        FoundStartPHI = true;
        continue;
      }
      // Check if we found the exit user.
      BasicBlock *Parent = U->getParent();
      if (Parent != BB) {
        // We must have a single exit instruction.
        if (ExitInstruction != 0)
          return false;
        ExitInstruction = Iter;
      }
      // We can't have multiple inside users.
      if (FoundInBlockUser)
        return false;
      FoundInBlockUser = true;
      Iter = U;
    }

    // We found a reduction var if we have reached the original
    // phi node and we only have a single instruction with out-of-loop
    // users.
    if (FoundStartPHI && ExitInstruction) {
      // This instruction is allowed to have out-of-loop users.
      AllowedExit.insert(ExitInstruction);

      // Save the description of this reduction variable.
      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
      Reductions[Phi] = RD;
      return true;
    }
  }
}

/// Returns true if instruction I is compatible with the given reduction kind.
/// A PHI is always accepted since it may be part of the reduction cycle.
bool
LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                            ReductionKind Kind) {
  switch (I->getOpcode()) {
    default:
      return false;
    case Instruction::PHI:
      // possibly.
      return true;
    case Instruction::Add:
    case Instruction::Sub:
      // NOTE(review): Sub is accepted under IntegerAdd, and UDiv/SDiv under
      // IntegerMult below, but the reduction combine in vectorizeLoop only
      // emits Add/Mul with the additive/multiplicative identity - verify
      // that non-commutative ops in the cycle actually produce correct
      // reductions before relying on this.
      return Kind == IntegerAdd;
    case Instruction::Mul:
    case Instruction::UDiv:
    case Instruction::SDiv:
      return Kind == IntegerMult;
    case Instruction::And:
      return Kind == IntegerAnd;
    case Instruction::Or:
      return Kind == IntegerOr;
    case Instruction::Xor:
      return Kind == IntegerXor;
  }
}

/// Returns true if Phi is an induction variable: an affine SCEV add
/// recurrence with a stride of exactly one.
bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  // Check that the PHI is consecutive and starts at zero.
  const SCEV *PhiScev = SE->getSCEV(Phi);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
    return false;
  }
  const SCEV *Step = AR->getStepRecurrence(*SE);

  if (!Step->isOne()) {
    DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
    return false;
  }
  return true;
}

/// Pick the vectorization factor in [1, VF] (powers of two) with the lowest
/// per-element expected cost. Returns 1 (scalar) when no target cost info
/// is available or no vector width beats the scalar loop.
unsigned
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
  if (!VTTI) {
    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
    return 1;
  }

  float Cost = expectedCost(1);
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
  for (unsigned i=2; i <= VF; i*=2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    float VectorCost = expectedCost(i) / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
  return Width;
}

/// Sum of the estimated costs of all instructions in the (single) loop block
/// at vectorization factor VF.
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  // We can only estimate the cost of single basic block loops.
  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");

  BasicBlock *BB = TheLoop->getHeader();
  unsigned Cost = 0;

  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    Instruction *Inst = it;
    unsigned C = getInstructionCost(Inst, VF);
    Cost += C;
    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
          " For instruction: "<< *Inst << "\n");
  }

  return Cost;
}

/// Estimate the cost of a single instruction when vectorized at factor VF,
/// using the target transform info (VTTI). This must mirror the widening
/// decisions made in SingleBlockLoopVectorizer::vectorizeLoop.
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  assert(VTTI && "Invalid vector target transformation info");

  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  Type *RetTy = I->getType();
  Type *VectorTy = ToVectorTy(RetTy, VF);


  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
    case Instruction::GetElementPtr:
      // We mark this instruction as zero-cost because scalar GEPs are usually
      // lowered to the instruction addressing mode. At the moment we don't
      // generate vector geps.
      return 0;
    case Instruction::Br: {
      return VTTI->getCFInstrCost(I->getOpcode());
    }
    case Instruction::PHI:
      return 0;
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
    }
    case Instruction::Select: {
      SelectInst *SI = cast<SelectInst>(I);
      const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
      bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
      Type *CondTy = SI->getCondition()->getType();
      // NOTE(review): this looks inverted - a loop-invariant (scalar)
      // condition is the case that keeps a scalar i1, yet it is the branch
      // that widens CondTy to a vector. In vectorizeLoop the invariant case
      // extracts lane 0 and selects on a scalar condition. Likely should be
      // `if (!ScalarCond)`.
      if (ScalarCond)
        CondTy = VectorType::get(CondTy, VF);

      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Compares are costed on the operand (value) type, not the i1 result.
      Type *ValTy = I->getOperand(0)->getType();
      VectorTy = ToVectorTy(ValTy, VF);
      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
    }
    case Instruction::Store: {
      StoreInst *SI = cast<StoreInst>(I);
      Type *ValTy = SI->getValueOperand()->getType();
      VectorTy = ToVectorTy(ValTy, VF);

      if (VF == 1)
        return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
                                     SI->getAlignment(),
                                     SI->getPointerAddressSpace());

      // Scalarized stores.
      if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
        unsigned Cost = 0;
        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                              ValTy);
        // The cost of extracting from the value vector.
        Cost += VF * (ExtCost);
        // The cost of the scalar stores.
        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                           ValTy->getScalarType(),
                                           SI->getAlignment(),
                                           SI->getPointerAddressSpace());
        return Cost;
      }

      // Wide stores.
      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
                                   SI->getPointerAddressSpace());
    }
    case Instruction::Load: {
      LoadInst *LI = cast<LoadInst>(I);

      if (VF == 1)
        return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
                                     LI->getAlignment(),
                                     LI->getPointerAddressSpace());

      // Scalarized loads.
      if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
        unsigned Cost = 0;
        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
        // The cost of inserting the loaded value into the result vector.
        Cost += VF * (InCost);
        // The cost of the scalar loads.
        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                           RetTy->getScalarType(),
                                           LI->getAlignment(),
                                           LI->getPointerAddressSpace());
        return Cost;
      }

      // Wide loads.
      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
                                   LI->getPointerAddressSpace());
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
      return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
    }
    default: {
      // We are scalarizing the instruction. Return the cost of the scalar
      // instruction, plus the cost of insert and extract into vector
      // elements, times the vector width.
      unsigned Cost = 0;

      bool IsVoid = RetTy->isVoidTy();

      unsigned InsCost = (IsVoid ? 0 :
                          VTTI->getInstrCost(Instruction::InsertElement,
                                             VectorTy));

      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                            VectorTy);

      // The cost of inserting the results plus extracting each one of the
      // operands.
      Cost += VF * (InsCost + ExtCost * I->getNumOperands());

      // The cost of executing VF copies of the scalar instruction.
      Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
      return Cost;
    }
  }// end of switch.
}

/// Returns the VF-wide vector type for Scalar, or Scalar itself for VF == 1
/// or void (which has no vector form).
Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

} // namespace

// Pass registration boilerplate: declare the pass's analysis dependencies so
// the PassManager schedules AliasAnalysis, ScalarEvolution and LoopSimplify
// before us.
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {
  // Public factory used by the pass pipeline builders.
  Pass *createLoopVectorizePass() {
    return new LoopVectorize();
  }
}