LoopVectorize.cpp revision be73c7b903647221fbcaae302d31e90f53583040
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9#include "LoopVectorize.h" 10#include "llvm/ADT/SmallSet.h" 11#include "llvm/ADT/StringExtras.h" 12#include "llvm/Analysis/AliasAnalysis.h" 13#include "llvm/Analysis/AliasSetTracker.h" 14#include "llvm/Analysis/Dominators.h" 15#include "llvm/Analysis/LoopInfo.h" 16#include "llvm/Analysis/LoopIterator.h" 17#include "llvm/Analysis/LoopPass.h" 18#include "llvm/Analysis/ScalarEvolutionExpander.h" 19#include "llvm/Analysis/ScalarEvolutionExpressions.h" 20#include "llvm/Analysis/ValueTracking.h" 21#include "llvm/Analysis/Verifier.h" 22#include "llvm/IR/Constants.h" 23#include "llvm/IR/DataLayout.h" 24#include "llvm/IR/DerivedTypes.h" 25#include "llvm/IR/Function.h" 26#include "llvm/IR/Instructions.h" 27#include "llvm/IR/IntrinsicInst.h" 28#include "llvm/IR/LLVMContext.h" 29#include "llvm/IR/Module.h" 30#include "llvm/IR/Type.h" 31#include "llvm/IR/Value.h" 32#include "llvm/Pass.h" 33#include "llvm/Support/CommandLine.h" 34#include "llvm/Support/Debug.h" 35#include "llvm/Support/raw_ostream.h" 36#include "llvm/TargetTransformInfo.h" 37#include "llvm/Transforms/Scalar.h" 38#include "llvm/Transforms/Utils/BasicBlockUtils.h" 39#include "llvm/Transforms/Utils/Local.h" 40#include "llvm/Transforms/Vectorize.h" 41 42static cl::opt<unsigned> 43VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, 44 cl::desc("Sets the SIMD width. Zero is autoselect.")); 45 46static cl::opt<unsigned> 47VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden, 48 cl::desc("Sets the vectorization unroll count. " 49 "Zero is autoselect.")); 50 51static cl::opt<bool> 52EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, 53 cl::desc("Enable if-conversion during vectorization.")); 54 55namespace { 56 57/// The LoopVectorize Pass. 58struct LoopVectorize : public LoopPass { 59 /// Pass identification, replacement for typeid 60 static char ID; 61 62 explicit LoopVectorize() : LoopPass(ID) { 63 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 64 } 65 66 ScalarEvolution *SE; 67 DataLayout *DL; 68 LoopInfo *LI; 69 TargetTransformInfo *TTI; 70 DominatorTree *DT; 71 72 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { 73 // We only vectorize innermost loops. 74 if (!L->empty()) 75 return false; 76 77 SE = &getAnalysis<ScalarEvolution>(); 78 DL = getAnalysisIfAvailable<DataLayout>(); 79 LI = &getAnalysis<LoopInfo>(); 80 TTI = getAnalysisIfAvailable<TargetTransformInfo>(); 81 DT = &getAnalysis<DominatorTree>(); 82 83 DEBUG(dbgs() << "LV: Checking a loop in \"" << 84 L->getHeader()->getParent()->getName() << "\"\n"); 85 86 // Check if it is legal to vectorize the loop. 87 LoopVectorizationLegality LVL(L, SE, DL, DT); 88 if (!LVL.canVectorize()) { 89 DEBUG(dbgs() << "LV: Not vectorizing.\n"); 90 return false; 91 } 92 93 // Use the cost model. 94 LoopVectorizationCostModel CM(L, SE, LI, &LVL, TTI); 95 96 // Check the function attribues to find out if this function should be 97 // optimized for size. 98 Function *F = L->getHeader()->getParent(); 99 Attribute::AttrKind SzAttr = Attribute::OptimizeForSize; 100 Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat; 101 unsigned FnIndex = AttributeSet::FunctionIndex; 102 bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr); 103 bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr); 104 105 if (NoFloat) { 106 DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" 107 "attribute is used.\n"); 108 return false; 109 } 110 111 unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); 112 unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll); 113 114 if (VF == 1) { 115 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 116 return false; 117 } 118 119 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< 120 F->getParent()->getModuleIdentifier()<<"\n"); 121 DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); 122 123 // If we decided that it is *legal* to vectorizer the loop then do it. 124 InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF); 125 LB.vectorize(&LVL); 126 127 DEBUG(verifyFunction(*L->getHeader()->getParent())); 128 return true; 129 } 130 131 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 132 LoopPass::getAnalysisUsage(AU); 133 AU.addRequiredID(LoopSimplifyID); 134 AU.addRequiredID(LCSSAID); 135 AU.addRequired<LoopInfo>(); 136 AU.addRequired<ScalarEvolution>(); 137 AU.addRequired<DominatorTree>(); 138 AU.addPreserved<LoopInfo>(); 139 AU.addPreserved<DominatorTree>(); 140 } 141 142}; 143 144}// namespace 145 146//===----------------------------------------------------------------------===// 147// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 148// LoopVectorizationCostModel. 149//===----------------------------------------------------------------------===// 150 151void 152LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, 153 Loop *Lp, Value *Ptr) { 154 const SCEV *Sc = SE->getSCEV(Ptr); 155 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); 156 assert(AR && "Invalid addrec expression"); 157 const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); 158 const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); 159 Pointers.push_back(Ptr); 160 Starts.push_back(AR->getStart()); 161 Ends.push_back(ScEnd); 162} 163 164Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 165 // Save the current insertion location. 166 Instruction *Loc = Builder.GetInsertPoint(); 167 168 // We need to place the broadcast of invariant variables outside the loop. 169 Instruction *Instr = dyn_cast<Instruction>(V); 170 bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); 171 bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; 172 173 // Place the code for broadcasting invariant variables in the new preheader. 174 if (Invariant) 175 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 176 177 // Broadcast the scalar into all locations in the vector. 178 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 179 180 // Restore the builder insertion point. 181 if (Invariant) 182 Builder.SetInsertPoint(Loc); 183 184 return Shuf; 185} 186 187Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, 188 bool Negate) { 189 assert(Val->getType()->isVectorTy() && "Must be a vector"); 190 assert(Val->getType()->getScalarType()->isIntegerTy() && 191 "Elem must be an integer"); 192 // Create the types. 193 Type *ITy = Val->getType()->getScalarType(); 194 VectorType *Ty = cast<VectorType>(Val->getType()); 195 int VLen = Ty->getNumElements(); 196 SmallVector<Constant*, 8> Indices; 197 198 // Create a vector of consecutive numbers from zero to VF. 199 for (int i = 0; i < VLen; ++i) { 200 int Idx = Negate ? (-i): i; 201 Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx)); 202 } 203 204 // Add the consecutive indices to the vector value. 205 Constant *Cv = ConstantVector::get(Indices); 206 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 207 return Builder.CreateAdd(Val, Cv, "induction"); 208} 209 210int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { 211 assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); 212 213 // If this value is a pointer induction variable we know it is consecutive. 214 PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); 215 if (Phi && Inductions.count(Phi)) { 216 InductionInfo II = Inductions[Phi]; 217 if (PtrInduction == II.IK) 218 return 1; 219 } 220 221 GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 222 if (!Gep) 223 return 0; 224 225 unsigned NumOperands = Gep->getNumOperands(); 226 Value *LastIndex = Gep->getOperand(NumOperands - 1); 227 228 // Check that all of the gep indices are uniform except for the last. 229 for (unsigned i = 0; i < NumOperands - 1; ++i) 230 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 231 return 0; 232 233 // We can emit wide load/stores only if the last index is the induction 234 // variable. 235 const SCEV *Last = SE->getSCEV(LastIndex); 236 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 237 const SCEV *Step = AR->getStepRecurrence(*SE); 238 239 // The memory is consecutive because the last index is consecutive 240 // and all other indices are loop invariant. 241 if (Step->isOne()) 242 return 1; 243 if (Step->isAllOnesValue()) 244 return -1; 245 } 246 247 return 0; 248} 249 250bool LoopVectorizationLegality::isUniform(Value *V) { 251 return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); 252} 253 254InnerLoopVectorizer::VectorParts& 255InnerLoopVectorizer::getVectorValue(Value *V) { 256 assert(V != Induction && "The new induction variable should not be used."); 257 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 258 259 // If we have this scalar in the map, return it. 260 if (WidenMap.has(V)) 261 return WidenMap.get(V); 262 263 // If this scalar is unknown, assume that it is a constant or that it is 264 // loop invariant. Broadcast V and save the value for future uses. 265 Value *B = getBroadcastInstrs(V); 266 WidenMap.splat(V, B); 267 return WidenMap.get(V); 268} 269 270Constant* 271InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { 272 return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); 273} 274 275Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 276 assert(Vec->getType()->isVectorTy() && "Invalid type"); 277 SmallVector<Constant*, 8> ShuffleMask; 278 for (unsigned i = 0; i < VF; ++i) 279 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 280 281 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 282 ConstantVector::get(ShuffleMask), 283 "reverse"); 284} 285 286void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 287 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 288 // Holds vector parameters or scalars, in case of uniform vals. 289 SmallVector<VectorParts, 4> Params; 290 291 // Find all of the vectorized parameters. 292 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 293 Value *SrcOp = Instr->getOperand(op); 294 295 // If we are accessing the old induction variable, use the new one. 296 if (SrcOp == OldInduction) { 297 Params.push_back(getVectorValue(SrcOp)); 298 continue; 299 } 300 301 // Try using previously calculated values. 302 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 303 304 // If the src is an instruction that appeared earlier in the basic block 305 // then it should already be vectorized. 306 if (SrcInst && OrigLoop->contains(SrcInst)) { 307 assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); 308 // The parameter is a vector value from earlier. 309 Params.push_back(WidenMap.get(SrcInst)); 310 } else { 311 // The parameter is a scalar from outside the loop. Maybe even a constant. 312 VectorParts Scalars; 313 Scalars.append(UF, SrcOp); 314 Params.push_back(Scalars); 315 } 316 } 317 318 assert(Params.size() == Instr->getNumOperands() && 319 "Invalid number of operands"); 320 321 // Does this instruction return a value ? 322 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 323 324 Value *UndefVec = IsVoidRetTy ? 0 : 325 UndefValue::get(VectorType::get(Instr->getType(), VF)); 326 // Create a new entry in the WidenMap and initialize it to Undef or Null. 327 VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); 328 329 // For each scalar that we create: 330 for (unsigned Width = 0; Width < VF; ++Width) { 331 // For each vector unroll 'part': 332 for (unsigned Part = 0; Part < UF; ++Part) { 333 Instruction *Cloned = Instr->clone(); 334 if (!IsVoidRetTy) 335 Cloned->setName(Instr->getName() + ".cloned"); 336 // Replace the operands of the cloned instrucions with extracted scalars. 337 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 338 Value *Op = Params[op][Part]; 339 // Param is a vector. Need to extract the right lane. 340 if (Op->getType()->isVectorTy()) 341 Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); 342 Cloned->setOperand(op, Op); 343 } 344 345 // Place the cloned scalar in the new loop. 346 Builder.Insert(Cloned); 347 348 // If the original scalar returns a value we need to place it in a vector 349 // so that future users will be able to use it. 350 if (!IsVoidRetTy) 351 VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, 352 Builder.getInt32(Width)); 353 } 354 } 355} 356 357Value* 358InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, 359 Instruction *Loc) { 360 LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = 361 Legal->getRuntimePointerCheck(); 362 363 if (!PtrRtCheck->Need) 364 return NULL; 365 366 Value *MemoryRuntimeCheck = 0; 367 unsigned NumPointers = PtrRtCheck->Pointers.size(); 368 SmallVector<Value* , 2> Starts; 369 SmallVector<Value* , 2> Ends; 370 371 SCEVExpander Exp(*SE, "induction"); 372 373 // Use this type for pointer arithmetic. 374 Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0); 375 376 for (unsigned i = 0; i < NumPointers; ++i) { 377 Value *Ptr = PtrRtCheck->Pointers[i]; 378 const SCEV *Sc = SE->getSCEV(Ptr); 379 380 if (SE->isLoopInvariant(Sc, OrigLoop)) { 381 DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << 382 *Ptr <<"\n"); 383 Starts.push_back(Ptr); 384 Ends.push_back(Ptr); 385 } else { 386 DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); 387 388 Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); 389 Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); 390 Starts.push_back(Start); 391 Ends.push_back(End); 392 } 393 } 394 395 for (unsigned i = 0; i < NumPointers; ++i) { 396 for (unsigned j = i+1; j < NumPointers; ++j) { 397 Instruction::CastOps Op = Instruction::BitCast; 398 Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc); 399 Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc); 400 Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc); 401 Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc); 402 403 Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 404 Start0, End1, "bound0", Loc); 405 Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 406 Start1, End0, "bound1", Loc); 407 Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, 408 "found.conflict", Loc); 409 if (MemoryRuntimeCheck) 410 MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, 411 MemoryRuntimeCheck, 412 IsConflict, 413 "conflict.rdx", Loc); 414 else 415 MemoryRuntimeCheck = IsConflict; 416 417 } 418 } 419 420 return MemoryRuntimeCheck; 421} 422 423void 424InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 425 /* 426 In this function we generate a new loop. The new loop will contain 427 the vectorized instructions while the old loop will continue to run the 428 scalar remainder. 429 430 [ ] <-- vector loop bypass. 431 / | 432 / v 433 | [ ] <-- vector pre header. 434 | | 435 | v 436 | [ ] \ 437 | [ ]_| <-- vector loop. 438 | | 439 \ v 440 >[ ] <--- middle-block. 441 / | 442 / v 443 | [ ] <--- new preheader. 444 | | 445 | v 446 | [ ] \ 447 | [ ]_| <-- old scalar loop to handle remainder. 448 \ | 449 \ v 450 >[ ] <-- exit block. 451 ... 452 */ 453 454 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 455 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 456 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 457 assert(ExitBlock && "Must have an exit block"); 458 459 // Some loops have a single integer induction variable, while other loops 460 // don't. One example is c++ iterators that often have multiple pointer 461 // induction variables. In the code below we also support a case where we 462 // don't have a single induction variable. 463 OldInduction = Legal->getInduction(); 464 Type *IdxTy = OldInduction ? OldInduction->getType() : 465 DL->getIntPtrType(SE->getContext()); 466 467 // Find the loop boundaries. 468 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); 469 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 470 471 // Get the total trip count from the count by adding 1. 472 ExitCount = SE->getAddExpr(ExitCount, 473 SE->getConstant(ExitCount->getType(), 1)); 474 475 // Expand the trip count and place the new instructions in the preheader. 476 // Notice that the pre-header does not change, only the loop body. 477 SCEVExpander Exp(*SE, "induction"); 478 479 // Count holds the overall loop count (N). 480 Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 481 BypassBlock->getTerminator()); 482 483 // The loop index does not have to start at Zero. Find the original start 484 // value from the induction PHI node. If we don't have an induction variable 485 // then we know that it starts at zero. 486 Value *StartIdx = OldInduction ? 487 OldInduction->getIncomingValueForBlock(BypassBlock): 488 ConstantInt::get(IdxTy, 0); 489 490 assert(BypassBlock && "Invalid loop structure"); 491 492 // Generate the code that checks in runtime if arrays overlap. 493 Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, 494 BypassBlock->getTerminator()); 495 496 // Split the single block loop into the two loop structure described above. 497 BasicBlock *VectorPH = 498 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 499 BasicBlock *VecBody = 500 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 501 BasicBlock *MiddleBlock = 502 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 503 BasicBlock *ScalarPH = 504 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 505 506 // This is the location in which we add all of the logic for bypassing 507 // the new vector loop. 508 Instruction *Loc = BypassBlock->getTerminator(); 509 510 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 511 // inside the loop. 512 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 513 514 // Generate the induction variable. 515 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 516 // The loop step is equal to the vectorization factor (num of SIMD elements) 517 // times the unroll factor (num of SIMD instructions). 518 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 519 520 // We may need to extend the index in case there is a type mismatch. 521 // We know that the count starts at zero and does not overflow. 522 if (Count->getType() != IdxTy) { 523 // The exit count can be of pointer type. Convert it to the correct 524 // integer type. 525 if (ExitCount->getType()->isPointerTy()) 526 Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); 527 else 528 Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); 529 } 530 531 // Add the start index to the loop count to get the new end index. 532 Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); 533 534 // Now we need to generate the expression for N - (N % VF), which is 535 // the part that the vectorized body will execute. 536 Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc); 537 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 538 Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, 539 "end.idx.rnd.down", Loc); 540 541 // Now, compare the new count to zero. If it is zero skip the vector loop and 542 // jump to the scalar loop. 543 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 544 IdxEndRoundDown, 545 StartIdx, 546 "cmp.zero", Loc); 547 548 // If we are using memory runtime checks, include them in. 549 if (MemoryRuntimeCheck) 550 Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, 551 "CntOrMem", Loc); 552 553 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 554 // Remove the old terminator. 555 Loc->eraseFromParent(); 556 557 // We are going to resume the execution of the scalar loop. 558 // Go over all of the induction variables that we found and fix the 559 // PHIs that are left in the scalar version of the loop. 560 // The starting values of PHI nodes depend on the counter of the last 561 // iteration in the vectorized loop. 562 // If we come from a bypass edge then we need to start from the original 563 // start value. 564 565 // This variable saves the new starting index for the scalar loop. 566 PHINode *ResumeIndex = 0; 567 LoopVectorizationLegality::InductionList::iterator I, E; 568 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 569 for (I = List->begin(), E = List->end(); I != E; ++I) { 570 PHINode *OrigPhi = I->first; 571 LoopVectorizationLegality::InductionInfo II = I->second; 572 PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", 573 MiddleBlock->getTerminator()); 574 Value *EndValue = 0; 575 switch (II.IK) { 576 case LoopVectorizationLegality::NoInduction: 577 llvm_unreachable("Unknown induction"); 578 case LoopVectorizationLegality::IntInduction: { 579 // Handle the integer induction counter: 580 assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); 581 assert(OrigPhi == OldInduction && "Unknown integer PHI"); 582 // We know what the end value is. 583 EndValue = IdxEndRoundDown; 584 // We also know which PHI node holds it. 585 ResumeIndex = ResumeVal; 586 break; 587 } 588 case LoopVectorizationLegality::ReverseIntInduction: { 589 // Convert the CountRoundDown variable to the PHI size. 590 unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); 591 unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); 592 Value *CRD = CountRoundDown; 593 if (CRDSize > IISize) 594 CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, 595 II.StartValue->getType(), 596 "tr.crd", BypassBlock->getTerminator()); 597 else if (CRDSize < IISize) 598 CRD = CastInst::Create(Instruction::SExt, CountRoundDown, 599 II.StartValue->getType(), 600 "sext.crd", BypassBlock->getTerminator()); 601 // Handle reverse integer induction counter: 602 EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", 603 BypassBlock->getTerminator()); 604 break; 605 } 606 case LoopVectorizationLegality::PtrInduction: { 607 // For pointer induction variables, calculate the offset using 608 // the end index. 609 EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, 610 "ptr.ind.end", 611 BypassBlock->getTerminator()); 612 break; 613 } 614 }// end of case 615 616 // The new PHI merges the original incoming value, in case of a bypass, 617 // or the value at the end of the vectorized loop. 618 ResumeVal->addIncoming(II.StartValue, BypassBlock); 619 ResumeVal->addIncoming(EndValue, VecBody); 620 621 // Fix the scalar body counter (PHI node). 622 unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); 623 OrigPhi->setIncomingValue(BlockIdx, ResumeVal); 624 } 625 626 // If we are generating a new induction variable then we also need to 627 // generate the code that calculates the exit value. This value is not 628 // simply the end of the counter because we may skip the vectorized body 629 // in case of a runtime check. 630 if (!OldInduction){ 631 assert(!ResumeIndex && "Unexpected resume value found"); 632 ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", 633 MiddleBlock->getTerminator()); 634 ResumeIndex->addIncoming(StartIdx, BypassBlock); 635 ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); 636 } 637 638 // Make sure that we found the index where scalar loop needs to continue. 639 assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && 640 "Invalid resume Index"); 641 642 // Add a check in the middle block to see if we have completed 643 // all of the iterations in the first vector loop. 644 // If (N - N%VF) == N, then we *don't* need to run the remainder. 645 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 646 ResumeIndex, "cmp.n", 647 MiddleBlock->getTerminator()); 648 649 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 650 // Remove the old terminator. 651 MiddleBlock->getTerminator()->eraseFromParent(); 652 653 // Create i+1 and fill the PHINode. 654 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 655 Induction->addIncoming(StartIdx, VectorPH); 656 Induction->addIncoming(NextIdx, VecBody); 657 // Create the compare. 658 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 659 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 660 661 // Now we have two terminators. Remove the old one from the block. 662 VecBody->getTerminator()->eraseFromParent(); 663 664 // Get ready to start creating new instructions into the vectorized body. 665 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 666 667 // Create and register the new vector loop. 668 Loop* Lp = new Loop(); 669 Loop *ParentLoop = OrigLoop->getParentLoop(); 670 671 // Insert the new loop into the loop nest and register the new basic blocks. 672 if (ParentLoop) { 673 ParentLoop->addChildLoop(Lp); 674 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 675 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 676 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 677 } else { 678 LI->addTopLevelLoop(Lp); 679 } 680 681 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 682 683 // Save the state. 684 LoopVectorPreHeader = VectorPH; 685 LoopScalarPreHeader = ScalarPH; 686 LoopMiddleBlock = MiddleBlock; 687 LoopExitBlock = ExitBlock; 688 LoopVectorBody = VecBody; 689 LoopScalarBody = OldBasicBlock; 690 LoopBypassBlock = BypassBlock; 691} 692 693/// This function returns the identity element (or neutral element) for 694/// the operation K. 695static unsigned 696getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { 697 switch (K) { 698 case LoopVectorizationLegality::IntegerXor: 699 case LoopVectorizationLegality::IntegerAdd: 700 case LoopVectorizationLegality::IntegerOr: 701 // Adding, Xoring, Oring zero to a number does not change it. 702 return 0; 703 case LoopVectorizationLegality::IntegerMult: 704 // Multiplying a number by 1 does not change it. 705 return 1; 706 case LoopVectorizationLegality::IntegerAnd: 707 // AND-ing a number with an all-1 value does not change it. 708 return -1; 709 default: 710 llvm_unreachable("Unknown reduction kind"); 711 } 712} 713 714static bool 715isTriviallyVectorizableIntrinsic(Instruction *Inst) { 716 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); 717 if (!II) 718 return false; 719 switch (II->getIntrinsicID()) { 720 case Intrinsic::sqrt: 721 case Intrinsic::sin: 722 case Intrinsic::cos: 723 case Intrinsic::exp: 724 case Intrinsic::exp2: 725 case Intrinsic::log: 726 case Intrinsic::log10: 727 case Intrinsic::log2: 728 case Intrinsic::fabs: 729 case Intrinsic::floor: 730 case Intrinsic::ceil: 731 case Intrinsic::trunc: 732 case Intrinsic::rint: 733 case Intrinsic::nearbyint: 734 case Intrinsic::pow: 735 case Intrinsic::fma: 736 case Intrinsic::fmuladd: 737 return true; 738 default: 739 return false; 740 } 741 return false; 742} 743 744void 745InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { 746 //===------------------------------------------------===// 747 // 748 // Notice: any optimization or new instruction that go 749 // into the code below should be also be implemented in 750 // the cost-model. 751 // 752 //===------------------------------------------------===// 753 BasicBlock &BB = *OrigLoop->getHeader(); 754 Constant *Zero = 755 ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0); 756 757 // In order to support reduction variables we need to be able to vectorize 758 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 759 // stages. First, we create a new vector PHI node with no incoming edges. 760 // We use this value when we vectorize all of the instructions that use the 761 // PHI. Next, after all of the instructions in the block are complete we 762 // add the new incoming edges to the PHI. At this point all of the 763 // instructions in the basic block are vectorized, so we can use them to 764 // construct the PHI. 765 PhiVector RdxPHIsToFix; 766 767 // Scan the loop in a topological order to ensure that defs are vectorized 768 // before users. 769 LoopBlocksDFS DFS(OrigLoop); 770 DFS.perform(LI); 771 772 // Vectorize all of the blocks in the original loop. 773 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 774 be = DFS.endRPO(); bb != be; ++bb) 775 vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix); 776 777 // At this point every instruction in the original loop is widened to 778 // a vector form. We are almost done. Now, we need to fix the PHI nodes 779 // that we vectorized. The PHI nodes are currently empty because we did 780 // not want to introduce cycles. Notice that the remaining PHI nodes 781 // that we need to fix are reduction variables. 782 783 // Create the 'reduced' values for each of the induction vars. 784 // The reduced values are the vector values that we scalarize and combine 785 // after the loop is finished. 786 for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); 787 it != e; ++it) { 788 PHINode *RdxPhi = *it; 789 assert(RdxPhi && "Unable to recover vectorized PHI"); 790 791 // Find the reduction variable descriptor. 792 assert(Legal->getReductionVars()->count(RdxPhi) && 793 "Unable to find the reduction variable"); 794 LoopVectorizationLegality::ReductionDescriptor RdxDesc = 795 (*Legal->getReductionVars())[RdxPhi]; 796 797 // We need to generate a reduction vector from the incoming scalar. 798 // To do so, we need to generate the 'identity' vector and overide 799 // one of the elements with the incoming scalar reduction. We need 800 // to do it in the vector-loop preheader. 801 Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); 802 803 // This is the vector-clone of the value that leaves the loop. 804 VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); 805 Type *VecTy = VectorExit[0]->getType(); 806 807 // Find the reduction identity variable. Zero for addition, or, xor, 808 // one for multiplication, -1 for And. 809 Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), 810 VecTy->getScalarType()); 811 812 // This vector is the Identity vector where the first element is the 813 // incoming scalar reduction. 814 Value *VectorStart = Builder.CreateInsertElement(Identity, 815 RdxDesc.StartValue, Zero); 816 817 // Fix the vector-loop phi. 818 // We created the induction variable so we know that the 819 // preheader is the first entry. 820 BasicBlock *VecPreheader = Induction->getIncomingBlock(0); 821 822 // Reductions do not have to start at zero. They can start with 823 // any loop invariant values. 824 VectorParts &VecRdxPhi = WidenMap.get(RdxPhi); 825 BasicBlock *Latch = OrigLoop->getLoopLatch(); 826 Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); 827 VectorParts &Val = getVectorValue(LoopVal); 828 for (unsigned part = 0; part < UF; ++part) { 829 // Make sure to add the reduction stat value only to the 830 // first unroll part. 831 Value *StartVal = (part == 0) ? VectorStart : Identity; 832 cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); 833 cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody); 834 } 835 836 // Before each round, move the insertion point right between 837 // the PHIs and the values we are going to write. 838 // This allows us to write both PHINodes and the extractelement 839 // instructions. 840 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); 841 842 VectorParts RdxParts; 843 for (unsigned part = 0; part < UF; ++part) { 844 // This PHINode contains the vectorized reduction variable, or 845 // the initial value vector, if we bypass the vector loop. 846 VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); 847 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); 848 Value *StartVal = (part == 0) ? VectorStart : Identity; 849 NewPhi->addIncoming(StartVal, LoopBypassBlock); 850 NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody); 851 RdxParts.push_back(NewPhi); 852 } 853 854 // Reduce all of the unrolled parts into a single vector. 855 Value *ReducedPartRdx = RdxParts[0]; 856 for (unsigned part = 1; part < UF; ++part) { 857 switch (RdxDesc.Kind) { 858 case LoopVectorizationLegality::IntegerAdd: 859 ReducedPartRdx = 860 Builder.CreateAdd(RdxParts[part], ReducedPartRdx, "add.rdx"); 861 break; 862 case LoopVectorizationLegality::IntegerMult: 863 ReducedPartRdx = 864 Builder.CreateMul(RdxParts[part], ReducedPartRdx, "mul.rdx"); 865 break; 866 case LoopVectorizationLegality::IntegerOr: 867 ReducedPartRdx = 868 Builder.CreateOr(RdxParts[part], ReducedPartRdx, "or.rdx"); 869 break; 870 case LoopVectorizationLegality::IntegerAnd: 871 ReducedPartRdx = 872 Builder.CreateAnd(RdxParts[part], ReducedPartRdx, "and.rdx"); 873 break; 874 case LoopVectorizationLegality::IntegerXor: 875 ReducedPartRdx = 876 Builder.CreateXor(RdxParts[part], ReducedPartRdx, "xor.rdx"); 877 break; 878 default: 879 llvm_unreachable("Unknown reduction operation"); 880 } 881 } 882 883 884 // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles 885 // and vector ops, reducing the set of values being computed by half each 886 // round. 887 assert(isPowerOf2_32(VF) && 888 "Reduction emission only supported for pow2 vectors!"); 889 Value *TmpVec = ReducedPartRdx; 890 SmallVector<Constant*, 32> ShuffleMask(VF, 0); 891 for (unsigned i = VF; i != 1; i >>= 1) { 892 // Move the upper half of the vector to the lower half. 893 for (unsigned j = 0; j != i/2; ++j) 894 ShuffleMask[j] = Builder.getInt32(i/2 + j); 895 896 // Fill the rest of the mask with undef. 897 std::fill(&ShuffleMask[i/2], ShuffleMask.end(), 898 UndefValue::get(Builder.getInt32Ty())); 899 900 Value *Shuf = 901 Builder.CreateShuffleVector(TmpVec, 902 UndefValue::get(TmpVec->getType()), 903 ConstantVector::get(ShuffleMask), 904 "rdx.shuf"); 905 906 // Emit the operation on the shuffled value. 907 switch (RdxDesc.Kind) { 908 case LoopVectorizationLegality::IntegerAdd: 909 TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx"); 910 break; 911 case LoopVectorizationLegality::IntegerMult: 912 TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx"); 913 break; 914 case LoopVectorizationLegality::IntegerOr: 915 TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx"); 916 break; 917 case LoopVectorizationLegality::IntegerAnd: 918 TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx"); 919 break; 920 case LoopVectorizationLegality::IntegerXor: 921 TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx"); 922 break; 923 default: 924 llvm_unreachable("Unknown reduction operation"); 925 } 926 } 927 928 // The result is in the first element of the vector. 929 Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); 930 931 // Now, we need to fix the users of the reduction variable 932 // inside and outside of the scalar remainder loop. 933 // We know that the loop is in LCSSA form. We need to update the 934 // PHI nodes in the exit blocks. 935 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 936 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 937 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 938 if (!LCSSAPhi) continue; 939 940 // All PHINodes need to have a single entry edge, or two if 941 // we already fixed them. 942 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 943 944 // We found our reduction value exit-PHI. Update it with the 945 // incoming bypass edge. 946 if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { 947 // Add an edge coming from the bypass. 948 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); 949 break; 950 } 951 }// end of the LCSSA phi scan. 952 953 // Fix the scalar loop reduction variable with the incoming reduction sum 954 // from the vector body and from the backedge value. 955 int IncomingEdgeBlockIdx = 956 (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); 957 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 958 // Pick the other block. 959 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 960 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); 961 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); 962 }// end of for each redux variable. 963 964 // The Loop exit block may have single value PHI nodes where the incoming 965 // value is 'undef'. While vectorizing we only handled real values that 966 // were defined inside the loop. Here we handle the 'undef case'. 967 // See PR14725. 968 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 969 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 970 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 971 if (!LCSSAPhi) continue; 972 if (LCSSAPhi->getNumIncomingValues() == 1) 973 LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), 974 LoopMiddleBlock); 975 } 976} 977 978InnerLoopVectorizer::VectorParts 979InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 980 assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && 981 "Invalid edge"); 982 983 VectorParts SrcMask = createBlockInMask(Src); 984 985 // The terminator has to be a branch inst! 986 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 987 assert(BI && "Unexpected terminator found"); 988 989 if (BI->isConditional()) { 990 VectorParts EdgeMask = getVectorValue(BI->getCondition()); 991 992 if (BI->getSuccessor(0) != Dst) 993 for (unsigned part = 0; part < UF; ++part) 994 EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); 995 996 for (unsigned part = 0; part < UF; ++part) 997 EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); 998 return EdgeMask; 999 } 1000 1001 return SrcMask; 1002} 1003 1004InnerLoopVectorizer::VectorParts 1005InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { 1006 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 1007 1008 // Loop incoming mask is all-one. 1009 if (OrigLoop->getHeader() == BB) { 1010 Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); 1011 return getVectorValue(C); 1012 } 1013 1014 // This is the block mask. We OR all incoming edges, and with zero. 1015 Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); 1016 VectorParts BlockMask = getVectorValue(Zero); 1017 1018 // For each pred: 1019 for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { 1020 VectorParts EM = createEdgeMask(*it, BB); 1021 for (unsigned part = 0; part < UF; ++part) 1022 BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); 1023 } 1024 1025 return BlockMask; 1026} 1027 1028void 1029InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, 1030 BasicBlock *BB, PhiVector *PV) { 1031 Constant *Zero = Builder.getInt32(0); 1032 1033 // For each instruction in the old loop. 1034 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 1035 VectorParts &Entry = WidenMap.get(it); 1036 switch (it->getOpcode()) { 1037 case Instruction::Br: 1038 // Nothing to do for PHIs and BR, since we already took care of the 1039 // loop control flow instructions. 1040 continue; 1041 case Instruction::PHI:{ 1042 PHINode* P = cast<PHINode>(it); 1043 // Handle reduction variables: 1044 if (Legal->getReductionVars()->count(P)) { 1045 for (unsigned part = 0; part < UF; ++part) { 1046 // This is phase one of vectorizing PHIs. 1047 Type *VecTy = VectorType::get(it->getType(), VF); 1048 Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", 1049 LoopVectorBody-> getFirstInsertionPt()); 1050 } 1051 PV->push_back(P); 1052 continue; 1053 } 1054 1055 // Check for PHI nodes that are lowered to vector selects. 1056 if (P->getParent() != OrigLoop->getHeader()) { 1057 // We know that all PHIs in non header blocks are converted into 1058 // selects, so we don't have to worry about the insertion order and we 1059 // can just use the builder. 1060 1061 // At this point we generate the predication tree. There may be 1062 // duplications since this is a simple recursive scan, but future 1063 // optimizations will clean it up. 1064 VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), 1065 P->getParent()); 1066 1067 for (unsigned part = 0; part < UF; ++part) { 1068 VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); 1069 VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); 1070 Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part], 1071 "predphi"); 1072 } 1073 continue; 1074 } 1075 1076 // This PHINode must be an induction variable. 1077 // Make sure that we know about it. 1078 assert(Legal->getInductionVars()->count(P) && 1079 "Not an induction variable"); 1080 1081 LoopVectorizationLegality::InductionInfo II = 1082 Legal->getInductionVars()->lookup(P); 1083 1084 switch (II.IK) { 1085 case LoopVectorizationLegality::NoInduction: 1086 llvm_unreachable("Unknown induction"); 1087 case LoopVectorizationLegality::IntInduction: { 1088 assert(P == OldInduction && "Unexpected PHI"); 1089 Value *Broadcasted = getBroadcastInstrs(Induction); 1090 // After broadcasting the induction variable we need to make the 1091 // vector consecutive by adding 0, 1, 2 ... 1092 for (unsigned part = 0; part < UF; ++part) 1093 Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); 1094 continue; 1095 } 1096 case LoopVectorizationLegality::ReverseIntInduction: 1097 case LoopVectorizationLegality::PtrInduction: 1098 // Handle reverse integer and pointer inductions. 1099 Value *StartIdx = 0; 1100 // If we have a single integer induction variable then use it. 1101 // Otherwise, start counting at zero. 1102 if (OldInduction) { 1103 LoopVectorizationLegality::InductionInfo OldII = 1104 Legal->getInductionVars()->lookup(OldInduction); 1105 StartIdx = OldII.StartValue; 1106 } else { 1107 StartIdx = ConstantInt::get(Induction->getType(), 0); 1108 } 1109 // This is the normalized GEP that starts counting at zero. 1110 Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, 1111 "normalized.idx"); 1112 1113 // Handle the reverse integer induction variable case. 1114 if (LoopVectorizationLegality::ReverseIntInduction == II.IK) { 1115 IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); 1116 Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, 1117 "resize.norm.idx"); 1118 Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, 1119 "reverse.idx"); 1120 1121 // This is a new value so do not hoist it out. 1122 Value *Broadcasted = getBroadcastInstrs(ReverseInd); 1123 // After broadcasting the induction variable we need to make the 1124 // vector consecutive by adding ... -3, -2, -1, 0. 1125 for (unsigned part = 0; part < UF; ++part) 1126 Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true); 1127 continue; 1128 } 1129 1130 // Handle the pointer induction variable case. 1131 assert(P->getType()->isPointerTy() && "Unexpected type."); 1132 1133 // This is the vector of results. Notice that we don't generate 1134 // vector geps because scalar geps result in better code. 1135 for (unsigned part = 0; part < UF; ++part) { 1136 Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); 1137 for (unsigned int i = 0; i < VF; ++i) { 1138 Constant *Idx = ConstantInt::get(Induction->getType(), 1139 i + part * VF); 1140 Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, 1141 "gep.idx"); 1142 Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, 1143 "next.gep"); 1144 VecVal = Builder.CreateInsertElement(VecVal, SclrGep, 1145 Builder.getInt32(i), 1146 "insert.gep"); 1147 } 1148 Entry[part] = VecVal; 1149 } 1150 continue; 1151 } 1152 1153 }// End of PHI. 1154 1155 case Instruction::Add: 1156 case Instruction::FAdd: 1157 case Instruction::Sub: 1158 case Instruction::FSub: 1159 case Instruction::Mul: 1160 case Instruction::FMul: 1161 case Instruction::UDiv: 1162 case Instruction::SDiv: 1163 case Instruction::FDiv: 1164 case Instruction::URem: 1165 case Instruction::SRem: 1166 case Instruction::FRem: 1167 case Instruction::Shl: 1168 case Instruction::LShr: 1169 case Instruction::AShr: 1170 case Instruction::And: 1171 case Instruction::Or: 1172 case Instruction::Xor: { 1173 // Just widen binops. 1174 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); 1175 VectorParts &A = getVectorValue(it->getOperand(0)); 1176 VectorParts &B = getVectorValue(it->getOperand(1)); 1177 1178 // Use this vector value for all users of the original instruction. 1179 for (unsigned Part = 0; Part < UF; ++Part) { 1180 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); 1181 1182 // Update the NSW, NUW and Exact flags. 1183 BinaryOperator *VecOp = cast<BinaryOperator>(V); 1184 if (isa<OverflowingBinaryOperator>(BinOp)) { 1185 VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); 1186 VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); 1187 } 1188 if (isa<PossiblyExactOperator>(VecOp)) 1189 VecOp->setIsExact(BinOp->isExact()); 1190 1191 Entry[Part] = V; 1192 } 1193 break; 1194 } 1195 case Instruction::Select: { 1196 // Widen selects. 1197 // If the selector is loop invariant we can create a select 1198 // instruction with a scalar condition. Otherwise, use vector-select. 1199 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), 1200 OrigLoop); 1201 1202 // The condition can be loop invariant but still defined inside the 1203 // loop. This means that we can't just use the original 'cond' value. 1204 // We have to take the 'vectorized' value and pick the first lane. 1205 // Instcombine will make this a no-op. 1206 VectorParts &Cond = getVectorValue(it->getOperand(0)); 1207 VectorParts &Op0 = getVectorValue(it->getOperand(1)); 1208 VectorParts &Op1 = getVectorValue(it->getOperand(2)); 1209 Value *ScalarCond = Builder.CreateExtractElement(Cond[0], 1210 Builder.getInt32(0)); 1211 for (unsigned Part = 0; Part < UF; ++Part) { 1212 Entry[Part] = Builder.CreateSelect( 1213 InvariantCond ? ScalarCond : Cond[Part], 1214 Op0[Part], 1215 Op1[Part]); 1216 } 1217 break; 1218 } 1219 1220 case Instruction::ICmp: 1221 case Instruction::FCmp: { 1222 // Widen compares. Generate vector compares. 1223 bool FCmp = (it->getOpcode() == Instruction::FCmp); 1224 CmpInst *Cmp = dyn_cast<CmpInst>(it); 1225 VectorParts &A = getVectorValue(it->getOperand(0)); 1226 VectorParts &B = getVectorValue(it->getOperand(1)); 1227 for (unsigned Part = 0; Part < UF; ++Part) { 1228 Value *C = 0; 1229 if (FCmp) 1230 C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); 1231 else 1232 C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); 1233 Entry[Part] = C; 1234 } 1235 break; 1236 } 1237 1238 case Instruction::Store: { 1239 // Attempt to issue a wide store. 1240 StoreInst *SI = dyn_cast<StoreInst>(it); 1241 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); 1242 Value *Ptr = SI->getPointerOperand(); 1243 unsigned Alignment = SI->getAlignment(); 1244 1245 assert(!Legal->isUniform(Ptr) && 1246 "We do not allow storing to uniform addresses"); 1247 1248 1249 int Stride = Legal->isConsecutivePtr(Ptr); 1250 bool Reverse = Stride < 0; 1251 if (Stride == 0) { 1252 scalarizeInstruction(it); 1253 break; 1254 } 1255 1256 // Handle consecutive stores. 1257 1258 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1259 if (Gep) { 1260 // The last index does not have to be the induction. It can be 1261 // consecutive and be a function of the index. For example A[I+1]; 1262 unsigned NumOperands = Gep->getNumOperands(); 1263 1264 Value *LastGepOperand = Gep->getOperand(NumOperands - 1); 1265 VectorParts &GEPParts = getVectorValue(LastGepOperand); 1266 Value *LastIndex = GEPParts[0]; 1267 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1268 1269 // Create the new GEP with the new induction variable. 1270 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1271 Gep2->setOperand(NumOperands - 1, LastIndex); 1272 Ptr = Builder.Insert(Gep2); 1273 } else { 1274 // Use the induction element ptr. 1275 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1276 VectorParts &PtrVal = getVectorValue(Ptr); 1277 Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); 1278 } 1279 1280 VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); 1281 for (unsigned Part = 0; Part < UF; ++Part) { 1282 // Calculate the pointer for the specific unroll-part. 1283 Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); 1284 1285 if (Reverse) { 1286 // If we store to reverse consecutive memory locations then we need 1287 // to reverse the order of elements in the stored value. 1288 StoredVal[Part] = reverseVector(StoredVal[Part]); 1289 // If the address is consecutive but reversed, then the 1290 // wide store needs to start at the last vector element. 1291 PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); 1292 PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); 1293 } 1294 1295 Value *VecPtr = Builder.CreateBitCast(PartPtr, StTy->getPointerTo()); 1296 Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); 1297 } 1298 break; 1299 } 1300 case Instruction::Load: { 1301 // Attempt to issue a wide load. 1302 LoadInst *LI = dyn_cast<LoadInst>(it); 1303 Type *RetTy = VectorType::get(LI->getType(), VF); 1304 Value *Ptr = LI->getPointerOperand(); 1305 unsigned Alignment = LI->getAlignment(); 1306 1307 // If the pointer is loop invariant or if it is non consecutive, 1308 // scalarize the load. 1309 int Stride = Legal->isConsecutivePtr(Ptr); 1310 bool Reverse = Stride < 0; 1311 if (Legal->isUniform(Ptr) || Stride == 0) { 1312 scalarizeInstruction(it); 1313 break; 1314 } 1315 1316 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1317 if (Gep) { 1318 // The last index does not have to be the induction. It can be 1319 // consecutive and be a function of the index. For example A[I+1]; 1320 unsigned NumOperands = Gep->getNumOperands(); 1321 1322 Value *LastGepOperand = Gep->getOperand(NumOperands - 1); 1323 VectorParts &GEPParts = getVectorValue(LastGepOperand); 1324 Value *LastIndex = GEPParts[0]; 1325 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1326 1327 // Create the new GEP with the new induction variable. 1328 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1329 Gep2->setOperand(NumOperands - 1, LastIndex); 1330 Ptr = Builder.Insert(Gep2); 1331 } else { 1332 // Use the induction element ptr. 1333 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1334 VectorParts &PtrVal = getVectorValue(Ptr); 1335 Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); 1336 } 1337 1338 for (unsigned Part = 0; Part < UF; ++Part) { 1339 // Calculate the pointer for the specific unroll-part. 1340 Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); 1341 1342 if (Reverse) { 1343 // If the address is consecutive but reversed, then the 1344 // wide store needs to start at the last vector element. 1345 PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); 1346 PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); 1347 } 1348 1349 Value *VecPtr = Builder.CreateBitCast(PartPtr, RetTy->getPointerTo()); 1350 Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); 1351 cast<LoadInst>(LI)->setAlignment(Alignment); 1352 Entry[Part] = Reverse ? reverseVector(LI) : LI; 1353 } 1354 break; 1355 } 1356 case Instruction::ZExt: 1357 case Instruction::SExt: 1358 case Instruction::FPToUI: 1359 case Instruction::FPToSI: 1360 case Instruction::FPExt: 1361 case Instruction::PtrToInt: 1362 case Instruction::IntToPtr: 1363 case Instruction::SIToFP: 1364 case Instruction::UIToFP: 1365 case Instruction::Trunc: 1366 case Instruction::FPTrunc: 1367 case Instruction::BitCast: { 1368 CastInst *CI = dyn_cast<CastInst>(it); 1369 /// Optimize the special case where the source is the induction 1370 /// variable. Notice that we can only optimize the 'trunc' case 1371 /// because: a. FP conversions lose precision, b. sext/zext may wrap, 1372 /// c. other casts depend on pointer size. 1373 if (CI->getOperand(0) == OldInduction && 1374 it->getOpcode() == Instruction::Trunc) { 1375 Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, 1376 CI->getType()); 1377 Value *Broadcasted = getBroadcastInstrs(ScalarCast); 1378 for (unsigned Part = 0; Part < UF; ++Part) 1379 Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); 1380 break; 1381 } 1382 /// Vectorize casts. 1383 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 1384 1385 VectorParts &A = getVectorValue(it->getOperand(0)); 1386 for (unsigned Part = 0; Part < UF; ++Part) 1387 Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); 1388 break; 1389 } 1390 1391 case Instruction::Call: { 1392 assert(isTriviallyVectorizableIntrinsic(it)); 1393 Module *M = BB->getParent()->getParent(); 1394 IntrinsicInst *II = cast<IntrinsicInst>(it); 1395 Intrinsic::ID ID = II->getIntrinsicID(); 1396 for (unsigned Part = 0; Part < UF; ++Part) { 1397 SmallVector<Value*, 4> Args; 1398 for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) { 1399 VectorParts &Arg = getVectorValue(II->getArgOperand(i)); 1400 Args.push_back(Arg[Part]); 1401 } 1402 Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; 1403 Function *F = Intrinsic::getDeclaration(M, ID, Tys); 1404 Entry[Part] = Builder.CreateCall(F, Args); 1405 } 1406 break; 1407 } 1408 1409 default: 1410 // All other instructions are unsupported. Scalarize them. 1411 scalarizeInstruction(it); 1412 break; 1413 }// end of switch. 1414 }// end of for_each instr. 1415} 1416 1417void InnerLoopVectorizer::updateAnalysis() { 1418 // Forget the original basic block. 1419 SE->forgetLoop(OrigLoop); 1420 1421 // Update the dominator tree information. 1422 assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && 1423 "Entry does not dominate exit."); 1424 1425 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); 1426 DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); 1427 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); 1428 DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); 1429 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 1430 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 1431 1432 DEBUG(DT->verifyAnalysis()); 1433} 1434 1435bool LoopVectorizationLegality::canVectorizeWithIfConvert() { 1436 if (!EnableIfConversion) 1437 return false; 1438 1439 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); 1440 std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); 1441 1442 // Collect the blocks that need predication. 1443 for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { 1444 BasicBlock *BB = LoopBlocks[i]; 1445 1446 // We don't support switch statements inside loops. 1447 if (!isa<BranchInst>(BB->getTerminator())) 1448 return false; 1449 1450 // We must have at most two predecessors because we need to convert 1451 // all PHIs to selects. 1452 unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); 1453 if (Preds > 2) 1454 return false; 1455 1456 // We must be able to predicate all blocks that need to be predicated. 1457 if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) 1458 return false; 1459 } 1460 1461 // We can if-convert this loop. 1462 return true; 1463} 1464 1465bool LoopVectorizationLegality::canVectorize() { 1466 assert(TheLoop->getLoopPreheader() && "No preheader!!"); 1467 1468 // We can only vectorize innermost loops. 1469 if (TheLoop->getSubLoopsVector().size()) 1470 return false; 1471 1472 // We must have a single backedge. 1473 if (TheLoop->getNumBackEdges() != 1) 1474 return false; 1475 1476 // We must have a single exiting block. 1477 if (!TheLoop->getExitingBlock()) 1478 return false; 1479 1480 unsigned NumBlocks = TheLoop->getNumBlocks(); 1481 1482 // Check if we can if-convert non single-bb loops. 1483 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { 1484 DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); 1485 return false; 1486 } 1487 1488 // We need to have a loop header. 1489 BasicBlock *Latch = TheLoop->getLoopLatch(); 1490 DEBUG(dbgs() << "LV: Found a loop: " << 1491 TheLoop->getHeader()->getName() << "\n"); 1492 1493 // ScalarEvolution needs to be able to find the exit count. 1494 const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); 1495 if (ExitCount == SE->getCouldNotCompute()) { 1496 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 1497 return false; 1498 } 1499 1500 // Do not loop-vectorize loops with a tiny trip count. 1501 unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); 1502 if (TC > 0u && TC < TinyTripCountThreshold) { 1503 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << 1504 "This loop is not worth vectorizing.\n"); 1505 return false; 1506 } 1507 1508 // Check if we can vectorize the instructions and CFG in this loop. 1509 if (!canVectorizeInstrs()) { 1510 DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); 1511 return false; 1512 } 1513 1514 // Go over each instruction and look at memory deps. 1515 if (!canVectorizeMemory()) { 1516 DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); 1517 return false; 1518 } 1519 1520 // Collect all of the variables that remain uniform after vectorization. 1521 collectLoopUniforms(); 1522 1523 DEBUG(dbgs() << "LV: We can vectorize this loop" << 1524 (PtrRtCheck.Need ? " (with a runtime bound check)" : "") 1525 <<"!\n"); 1526 1527 // Okay! We can vectorize. At this point we don't have any other mem analysis 1528 // which may limit our maximum vectorization factor, so just return true with 1529 // no restrictions. 1530 return true; 1531} 1532 1533bool LoopVectorizationLegality::canVectorizeInstrs() { 1534 BasicBlock *PreHeader = TheLoop->getLoopPreheader(); 1535 BasicBlock *Header = TheLoop->getHeader(); 1536 1537 // For each block in the loop. 1538 for (Loop::block_iterator bb = TheLoop->block_begin(), 1539 be = TheLoop->block_end(); bb != be; ++bb) { 1540 1541 // Scan the instructions in the block and look for hazards. 1542 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 1543 ++it) { 1544 1545 if (PHINode *Phi = dyn_cast<PHINode>(it)) { 1546 // This should not happen because the loop should be normalized. 1547 if (Phi->getNumIncomingValues() != 2) { 1548 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 1549 return false; 1550 } 1551 1552 // Check that this PHI type is allowed. 1553 if (!Phi->getType()->isIntegerTy() && 1554 !Phi->getType()->isPointerTy()) { 1555 DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); 1556 return false; 1557 } 1558 1559 // If this PHINode is not in the header block, then we know that we 1560 // can convert it to select during if-conversion. No need to check if 1561 // the PHIs in this block are induction or reduction variables. 1562 if (*bb != Header) 1563 continue; 1564 1565 // This is the value coming from the preheader. 1566 Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); 1567 // Check if this is an induction variable. 1568 InductionKind IK = isInductionVariable(Phi); 1569 1570 if (NoInduction != IK) { 1571 // Int inductions are special because we only allow one IV. 1572 if (IK == IntInduction) { 1573 if (Induction) { 1574 DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); 1575 return false; 1576 } 1577 Induction = Phi; 1578 } 1579 1580 DEBUG(dbgs() << "LV: Found an induction variable.\n"); 1581 Inductions[Phi] = InductionInfo(StartValue, IK); 1582 continue; 1583 } 1584 1585 if (AddReductionVar(Phi, IntegerAdd)) { 1586 DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); 1587 continue; 1588 } 1589 if (AddReductionVar(Phi, IntegerMult)) { 1590 DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); 1591 continue; 1592 } 1593 if (AddReductionVar(Phi, IntegerOr)) { 1594 DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); 1595 continue; 1596 } 1597 if (AddReductionVar(Phi, IntegerAnd)) { 1598 DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); 1599 continue; 1600 } 1601 if (AddReductionVar(Phi, IntegerXor)) { 1602 DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); 1603 continue; 1604 } 1605 1606 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 1607 return false; 1608 }// end of PHI handling 1609 1610 // We still don't handle functions. 1611 CallInst *CI = dyn_cast<CallInst>(it); 1612 if (CI && !isTriviallyVectorizableIntrinsic(it)) { 1613 DEBUG(dbgs() << "LV: Found a call site.\n"); 1614 return false; 1615 } 1616 1617 // Check that the instruction return type is vectorizable. 1618 if (!VectorType::isValidElementType(it->getType()) && 1619 !it->getType()->isVoidTy()) { 1620 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); 1621 return false; 1622 } 1623 1624 // Check that the stored type is vectorizable. 1625 if (StoreInst *ST = dyn_cast<StoreInst>(it)) { 1626 Type *T = ST->getValueOperand()->getType(); 1627 if (!VectorType::isValidElementType(T)) 1628 return false; 1629 } 1630 1631 // Reduction instructions are allowed to have exit users. 1632 // All other instructions must not have external users. 1633 if (!AllowedExit.count(it)) 1634 //Check that all of the users of the loop are inside the BB. 1635 for (Value::use_iterator I = it->use_begin(), E = it->use_end(); 1636 I != E; ++I) { 1637 Instruction *U = cast<Instruction>(*I); 1638 // This user may be a reduction exit value. 1639 if (!TheLoop->contains(U)) { 1640 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); 1641 return false; 1642 } 1643 } 1644 } // next instr. 1645 1646 } 1647 1648 if (!Induction) { 1649 DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); 1650 assert(getInductionVars()->size() && "No induction variables"); 1651 } 1652 1653 return true; 1654} 1655 1656void LoopVectorizationLegality::collectLoopUniforms() { 1657 // We now know that the loop is vectorizable! 1658 // Collect variables that will remain uniform after vectorization. 1659 std::vector<Value*> Worklist; 1660 BasicBlock *Latch = TheLoop->getLoopLatch(); 1661 1662 // Start with the conditional branch and walk up the block. 1663 Worklist.push_back(Latch->getTerminator()->getOperand(0)); 1664 1665 while (Worklist.size()) { 1666 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 1667 Worklist.pop_back(); 1668 1669 // Look at instructions inside this loop. 1670 // Stop when reaching PHI nodes. 1671 // TODO: we need to follow values all over the loop, not only in this block. 1672 if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) 1673 continue; 1674 1675 // This is a known uniform. 1676 Uniforms.insert(I); 1677 1678 // Insert all operands. 1679 for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { 1680 Worklist.push_back(I->getOperand(i)); 1681 } 1682 } 1683} 1684 1685bool LoopVectorizationLegality::canVectorizeMemory() { 1686 typedef SmallVector<Value*, 16> ValueVector; 1687 typedef SmallPtrSet<Value*, 16> ValueSet; 1688 // Holds the Load and Store *instructions*. 1689 ValueVector Loads; 1690 ValueVector Stores; 1691 PtrRtCheck.Pointers.clear(); 1692 PtrRtCheck.Need = false; 1693 1694 // For each block. 1695 for (Loop::block_iterator bb = TheLoop->block_begin(), 1696 be = TheLoop->block_end(); bb != be; ++bb) { 1697 1698 // Scan the BB and collect legal loads and stores. 1699 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 1700 ++it) { 1701 1702 // If this is a load, save it. If this instruction can read from memory 1703 // but is not a load, then we quit. Notice that we don't handle function 1704 // calls that read or write. 1705 if (it->mayReadFromMemory()) { 1706 LoadInst *Ld = dyn_cast<LoadInst>(it); 1707 if (!Ld) return false; 1708 if (!Ld->isSimple()) { 1709 DEBUG(dbgs() << "LV: Found a non-simple load.\n"); 1710 return false; 1711 } 1712 Loads.push_back(Ld); 1713 continue; 1714 } 1715 1716 // Save 'store' instructions. Abort if other instructions write to memory. 1717 if (it->mayWriteToMemory()) { 1718 StoreInst *St = dyn_cast<StoreInst>(it); 1719 if (!St) return false; 1720 if (!St->isSimple()) { 1721 DEBUG(dbgs() << "LV: Found a non-simple store.\n"); 1722 return false; 1723 } 1724 Stores.push_back(St); 1725 } 1726 } // next instr. 1727 } // next block. 1728 1729 // Now we have two lists that hold the loads and the stores. 1730 // Next, we find the pointers that they use. 1731 1732 // Check if we see any stores. If there are no stores, then we don't 1733 // care if the pointers are *restrict*. 1734 if (!Stores.size()) { 1735 DEBUG(dbgs() << "LV: Found a read-only loop!\n"); 1736 return true; 1737 } 1738 1739 // Holds the read and read-write *pointers* that we find. 1740 ValueVector Reads; 1741 ValueVector ReadWrites; 1742 1743 // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects 1744 // multiple times on the same object. If the ptr is accessed twice, once 1745 // for read and once for write, it will only appear once (on the write 1746 // list). This is okay, since we are going to check for conflicts between 1747 // writes and between reads and writes, but not between reads and reads. 1748 ValueSet Seen; 1749 1750 ValueVector::iterator I, IE; 1751 for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { 1752 StoreInst *ST = cast<StoreInst>(*I); 1753 Value* Ptr = ST->getPointerOperand(); 1754 1755 if (isUniform(Ptr)) { 1756 DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); 1757 return false; 1758 } 1759 1760 // If we did *not* see this pointer before, insert it to 1761 // the read-write list. At this phase it is only a 'write' list. 1762 if (Seen.insert(Ptr)) 1763 ReadWrites.push_back(Ptr); 1764 } 1765 1766 for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { 1767 LoadInst *LD = cast<LoadInst>(*I); 1768 Value* Ptr = LD->getPointerOperand(); 1769 // If we did *not* see this pointer before, insert it to the 1770 // read list. If we *did* see it before, then it is already in 1771 // the read-write list. This allows us to vectorize expressions 1772 // such as A[i] += x; Because the address of A[i] is a read-write 1773 // pointer. This only works if the index of A[i] is consecutive. 1774 // If the address of i is unknown (for example A[B[i]]) then we may 1775 // read a few words, modify, and write a few words, and some of the 1776 // words may be written to the same address. 1777 if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) 1778 Reads.push_back(Ptr); 1779 } 1780 1781 // If we write (or read-write) to a single destination and there are no 1782 // other reads in this loop then is it safe to vectorize. 1783 if (ReadWrites.size() == 1 && Reads.size() == 0) { 1784 DEBUG(dbgs() << "LV: Found a write-only loop!\n"); 1785 return true; 1786 } 1787 1788 // Find pointers with computable bounds. We are going to use this information 1789 // to place a runtime bound check. 1790 bool CanDoRT = true; 1791 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) 1792 if (hasComputableBounds(*I)) { 1793 PtrRtCheck.insert(SE, TheLoop, *I); 1794 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 1795 } else { 1796 CanDoRT = false; 1797 break; 1798 } 1799 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) 1800 if (hasComputableBounds(*I)) { 1801 PtrRtCheck.insert(SE, TheLoop, *I); 1802 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 1803 } else { 1804 CanDoRT = false; 1805 break; 1806 } 1807 1808 // Check that we did not collect too many pointers or found a 1809 // unsizeable pointer. 1810 if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { 1811 PtrRtCheck.reset(); 1812 CanDoRT = false; 1813 } 1814 1815 if (CanDoRT) { 1816 DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); 1817 } 1818 1819 bool NeedRTCheck = false; 1820 1821 // Now that the pointers are in two lists (Reads and ReadWrites), we 1822 // can check that there are no conflicts between each of the writes and 1823 // between the writes to the reads. 1824 ValueSet WriteObjects; 1825 ValueVector TempObjects; 1826 1827 // Check that the read-writes do not conflict with other read-write 1828 // pointers. 1829 bool AllWritesIdentified = true; 1830 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { 1831 GetUnderlyingObjects(*I, TempObjects, DL); 1832 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1833 it != e; ++it) { 1834 if (!isIdentifiedObject(*it)) { 1835 DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); 1836 NeedRTCheck = true; 1837 AllWritesIdentified = false; 1838 } 1839 if (!WriteObjects.insert(*it)) { 1840 DEBUG(dbgs() << "LV: Found a possible write-write reorder:" 1841 << **it <<"\n"); 1842 return false; 1843 } 1844 } 1845 TempObjects.clear(); 1846 } 1847 1848 /// Check that the reads don't conflict with the read-writes. 1849 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { 1850 GetUnderlyingObjects(*I, TempObjects, DL); 1851 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1852 it != e; ++it) { 1853 // If all of the writes are identified then we don't care if the read 1854 // pointer is identified or not. 1855 if (!AllWritesIdentified && !isIdentifiedObject(*it)) { 1856 DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); 1857 NeedRTCheck = true; 1858 } 1859 if (WriteObjects.count(*it)) { 1860 DEBUG(dbgs() << "LV: Found a possible read/write reorder:" 1861 << **it <<"\n"); 1862 return false; 1863 } 1864 } 1865 TempObjects.clear(); 1866 } 1867 1868 PtrRtCheck.Need = NeedRTCheck; 1869 if (NeedRTCheck && !CanDoRT) { 1870 DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << 1871 "the array bounds.\n"); 1872 PtrRtCheck.reset(); 1873 return false; 1874 } 1875 1876 DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") << 1877 " need a runtime memory check.\n"); 1878 return true; 1879} 1880 1881bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, 1882 ReductionKind Kind) { 1883 if (Phi->getNumIncomingValues() != 2) 1884 return false; 1885 1886 // Reduction variables are only found in the loop header block. 1887 if (Phi->getParent() != TheLoop->getHeader()) 1888 return false; 1889 1890 // Obtain the reduction start value from the value that comes from the loop 1891 // preheader. 1892 Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); 1893 1894 // ExitInstruction is the single value which is used outside the loop. 1895 // We only allow for a single reduction value to be used outside the loop. 1896 // This includes users of the reduction, variables (which form a cycle 1897 // which ends in the phi node). 1898 Instruction *ExitInstruction = 0; 1899 1900 // Iter is our iterator. We start with the PHI node and scan for all of the 1901 // users of this instruction. All users must be instructions that can be 1902 // used as reduction variables (such as ADD). We may have a single 1903 // out-of-block user. The cycle must end with the original PHI. 1904 Instruction *Iter = Phi; 1905 while (true) { 1906 // If the instruction has no users then this is a broken 1907 // chain and can't be a reduction variable. 1908 if (Iter->use_empty()) 1909 return false; 1910 1911 // Did we find a user inside this loop already ? 1912 bool FoundInBlockUser = false; 1913 // Did we reach the initial PHI node already ? 1914 bool FoundStartPHI = false; 1915 1916 // For each of the *users* of iter. 1917 for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); 1918 it != e; ++it) { 1919 Instruction *U = cast<Instruction>(*it); 1920 // We already know that the PHI is a user. 1921 if (U == Phi) { 1922 FoundStartPHI = true; 1923 continue; 1924 } 1925 1926 // Check if we found the exit user. 1927 BasicBlock *Parent = U->getParent(); 1928 if (!TheLoop->contains(Parent)) { 1929 // Exit if you find multiple outside users. 1930 if (ExitInstruction != 0) 1931 return false; 1932 ExitInstruction = Iter; 1933 } 1934 1935 // We allow in-loop PHINodes which are not the original reduction PHI 1936 // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE 1937 // structure) then don't skip this PHI. 1938 if (isa<PHINode>(Iter) && isa<PHINode>(U) && 1939 U->getParent() != TheLoop->getHeader() && 1940 TheLoop->contains(U) && 1941 Iter->getNumUses() > 1) 1942 continue; 1943 1944 // We can't have multiple inside users. 1945 if (FoundInBlockUser) 1946 return false; 1947 FoundInBlockUser = true; 1948 1949 // Any reduction instr must be of one of the allowed kinds. 1950 if (!isReductionInstr(U, Kind)) 1951 return false; 1952 1953 // Reductions of instructions such as Div, and Sub is only 1954 // possible if the LHS is the reduction variable. 1955 if (!U->isCommutative() && U->getOperand(0) != Iter) 1956 return false; 1957 1958 Iter = U; 1959 } 1960 1961 // We found a reduction var if we have reached the original 1962 // phi node and we only have a single instruction with out-of-loop 1963 // users. 1964 if (FoundStartPHI && ExitInstruction) { 1965 // This instruction is allowed to have out-of-loop users. 1966 AllowedExit.insert(ExitInstruction); 1967 1968 // Save the description of this reduction variable. 1969 ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); 1970 Reductions[Phi] = RD; 1971 return true; 1972 } 1973 1974 // If we've reached the start PHI but did not find an outside user then 1975 // this is dead code. Abort. 1976 if (FoundStartPHI) 1977 return false; 1978 } 1979} 1980 1981bool 1982LoopVectorizationLegality::isReductionInstr(Instruction *I, 1983 ReductionKind Kind) { 1984 switch (I->getOpcode()) { 1985 default: 1986 return false; 1987 case Instruction::PHI: 1988 // possibly. 1989 return true; 1990 case Instruction::Sub: 1991 case Instruction::Add: 1992 return Kind == IntegerAdd; 1993 case Instruction::SDiv: 1994 case Instruction::UDiv: 1995 case Instruction::Mul: 1996 return Kind == IntegerMult; 1997 case Instruction::And: 1998 return Kind == IntegerAnd; 1999 case Instruction::Or: 2000 return Kind == IntegerOr; 2001 case Instruction::Xor: 2002 return Kind == IntegerXor; 2003 } 2004} 2005 2006LoopVectorizationLegality::InductionKind 2007LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { 2008 Type *PhiTy = Phi->getType(); 2009 // We only handle integer and pointer inductions variables. 2010 if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) 2011 return NoInduction; 2012 2013 // Check that the PHI is consecutive and starts at zero. 2014 const SCEV *PhiScev = SE->getSCEV(Phi); 2015 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 2016 if (!AR) { 2017 DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); 2018 return NoInduction; 2019 } 2020 const SCEV *Step = AR->getStepRecurrence(*SE); 2021 2022 // Integer inductions need to have a stride of one. 2023 if (PhiTy->isIntegerTy()) { 2024 if (Step->isOne()) 2025 return IntInduction; 2026 if (Step->isAllOnesValue()) 2027 return ReverseIntInduction; 2028 return NoInduction; 2029 } 2030 2031 // Calculate the pointer stride and check if it is consecutive. 2032 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); 2033 if (!C) 2034 return NoInduction; 2035 2036 assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); 2037 uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); 2038 if (C->getValue()->equalsInt(Size)) 2039 return PtrInduction; 2040 2041 return NoInduction; 2042} 2043 2044bool LoopVectorizationLegality::isInductionVariable(const Value *V) { 2045 Value *In0 = const_cast<Value*>(V); 2046 PHINode *PN = dyn_cast_or_null<PHINode>(In0); 2047 if (!PN) 2048 return false; 2049 2050 return Inductions.count(PN); 2051} 2052 2053bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { 2054 assert(TheLoop->contains(BB) && "Unknown block used"); 2055 2056 // Blocks that do not dominate the latch need predication. 2057 BasicBlock* Latch = TheLoop->getLoopLatch(); 2058 return !DT->dominates(BB, Latch); 2059} 2060 2061bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { 2062 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 2063 // We don't predicate loads/stores at the moment. 2064 if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) 2065 return false; 2066 2067 // The instructions below can trap. 2068 switch (it->getOpcode()) { 2069 default: continue; 2070 case Instruction::UDiv: 2071 case Instruction::SDiv: 2072 case Instruction::URem: 2073 case Instruction::SRem: 2074 return false; 2075 } 2076 } 2077 2078 return true; 2079} 2080 2081bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { 2082 const SCEV *PhiScev = SE->getSCEV(Ptr); 2083 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 2084 if (!AR) 2085 return false; 2086 2087 return AR->isAffine(); 2088} 2089 2090unsigned 2091LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, 2092 unsigned UserVF) { 2093 if (OptForSize && Legal->getRuntimePointerCheck()->Need) { 2094 DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); 2095 return 1; 2096 } 2097 2098 // Find the trip count. 2099 unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); 2100 DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); 2101 2102 unsigned VF = MaxVectorSize; 2103 2104 // If we optimize the program for size, avoid creating the tail loop. 2105 if (OptForSize) { 2106 // If we are unable to calculate the trip count then don't try to vectorize. 2107 if (TC < 2) { 2108 DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); 2109 return 1; 2110 } 2111 2112 // Find the maximum SIMD width that can fit within the trip count. 2113 VF = TC % MaxVectorSize; 2114 2115 if (VF == 0) 2116 VF = MaxVectorSize; 2117 2118 // If the trip count that we found modulo the vectorization factor is not 2119 // zero then we require a tail. 2120 if (VF < 2) { 2121 DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); 2122 return 1; 2123 } 2124 } 2125 2126 if (UserVF != 0) { 2127 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 2128 DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n"); 2129 2130 return UserVF; 2131 } 2132 2133 if (!TTI) { 2134 DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); 2135 return 1; 2136 } 2137 2138 float Cost = expectedCost(1); 2139 unsigned Width = 1; 2140 DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); 2141 for (unsigned i=2; i <= VF; i*=2) { 2142 // Notice that the vector loop needs to be executed less times, so 2143 // we need to divide the cost of the vector loops by the width of 2144 // the vector elements. 2145 float VectorCost = expectedCost(i) / (float)i; 2146 DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << 2147 (int)VectorCost << ".\n"); 2148 if (VectorCost < Cost) { 2149 Cost = VectorCost; 2150 Width = i; 2151 } 2152 } 2153 2154 DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); 2155 return Width; 2156} 2157 2158unsigned 2159LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, 2160 unsigned UserUF) { 2161 // Use the user preference, unless 'auto' is selected. 2162 if (UserUF != 0) 2163 return UserUF; 2164 2165 // When we optimize for size we don't unroll. 2166 if (OptForSize) 2167 return 1; 2168 2169 unsigned TargetVectorRegisters = TTI->getNumberOfRegisters(true); 2170 DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters << 2171 " vector registers\n"); 2172 2173 LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); 2174 // We divide by these constants so assume that we have at least one 2175 // instruction that uses at least one register. 2176 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); 2177 R.NumInstructions = std::max(R.NumInstructions, 1U); 2178 2179 // We calculate the unroll factor using the following formula. 2180 // Subtract the number of loop invariants from the number of available 2181 // registers. These registers are used by all of the unrolled instances. 2182 // Next, divide the remaining registers by the number of registers that is 2183 // required by the loop, in order to estimate how many parallel instances 2184 // fit without causing spills. 2185 unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers; 2186 2187 // We don't want to unroll the loops to the point where they do not fit into 2188 // the decoded cache. Assume that we only allow 32 IR instructions. 2189 UF = std::min(UF, (32 / R.NumInstructions)); 2190 2191 // Clamp the unroll factor ranges to reasonable factors. 2192 if (UF > MaxUnrollSize) 2193 UF = MaxUnrollSize; 2194 else if (UF < 1) 2195 UF = 1; 2196 2197 return UF; 2198} 2199 2200LoopVectorizationCostModel::RegisterUsage 2201LoopVectorizationCostModel::calculateRegisterUsage() { 2202 // This function calculates the register usage by measuring the highest number 2203 // of values that are alive at a single location. Obviously, this is a very 2204 // rough estimation. We scan the loop in a topological order in order and 2205 // assign a number to each instruction. We use RPO to ensure that defs are 2206 // met before their users. We assume that each instruction that has in-loop 2207 // users starts an interval. We record every time that an in-loop value is 2208 // used, so we have a list of the first and last occurrences of each 2209 // instruction. Next, we transpose this data structure into a multi map that 2210 // holds the list of intervals that *end* at a specific location. This multi 2211 // map allows us to perform a linear search. We scan the instructions linearly 2212 // and record each time that a new interval starts, by placing it in a set. 2213 // If we find this value in the multi-map then we remove it from the set. 2214 // The max register usage is the maximum size of the set. 2215 // We also search for instructions that are defined outside the loop, but are 2216 // used inside the loop. We need this number separately from the max-interval 2217 // usage number because when we unroll, loop-invariant values do not take 2218 // more register. 2219 LoopBlocksDFS DFS(TheLoop); 2220 DFS.perform(LI); 2221 2222 RegisterUsage R; 2223 R.NumInstructions = 0; 2224 2225 // Each 'key' in the map opens a new interval. The values 2226 // of the map are the index of the 'last seen' usage of the 2227 // instruction that is the key. 2228 typedef DenseMap<Instruction*, unsigned> IntervalMap; 2229 // Maps instruction to its index. 2230 DenseMap<unsigned, Instruction*> IdxToInstr; 2231 // Marks the end of each interval. 2232 IntervalMap EndPoint; 2233 // Saves the list of instruction indices that are used in the loop. 2234 SmallSet<Instruction*, 8> Ends; 2235 // Saves the list of values that are used in the loop but are 2236 // defined outside the loop, such as arguments and constants. 2237 SmallPtrSet<Value*, 8> LoopInvariants; 2238 2239 unsigned Index = 0; 2240 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 2241 be = DFS.endRPO(); bb != be; ++bb) { 2242 R.NumInstructions += (*bb)->size(); 2243 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 2244 ++it) { 2245 Instruction *I = it; 2246 IdxToInstr[Index++] = I; 2247 2248 // Save the end location of each USE. 2249 for (unsigned i = 0; i < I->getNumOperands(); ++i) { 2250 Value *U = I->getOperand(i); 2251 Instruction *Instr = dyn_cast<Instruction>(U); 2252 2253 // Ignore non-instruction values such as arguments, constants, etc. 2254 if (!Instr) continue; 2255 2256 // If this instruction is outside the loop then record it and continue. 2257 if (!TheLoop->contains(Instr)) { 2258 LoopInvariants.insert(Instr); 2259 continue; 2260 } 2261 2262 // Overwrite previous end points. 2263 EndPoint[Instr] = Index; 2264 Ends.insert(Instr); 2265 } 2266 } 2267 } 2268 2269 // Saves the list of intervals that end with the index in 'key'. 2270 typedef SmallVector<Instruction*, 2> InstrList; 2271 DenseMap<unsigned, InstrList> TransposeEnds; 2272 2273 // Transpose the EndPoints to a list of values that end at each index. 2274 for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); 2275 it != e; ++it) 2276 TransposeEnds[it->second].push_back(it->first); 2277 2278 SmallSet<Instruction*, 8> OpenIntervals; 2279 unsigned MaxUsage = 0; 2280 2281 2282 DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 2283 for (unsigned int i = 0; i < Index; ++i) { 2284 Instruction *I = IdxToInstr[i]; 2285 // Ignore instructions that are never used within the loop. 2286 if (!Ends.count(I)) continue; 2287 2288 // Remove all of the instructions that end at this location. 2289 InstrList &List = TransposeEnds[i]; 2290 for (unsigned int j=0, e = List.size(); j < e; ++j) 2291 OpenIntervals.erase(List[j]); 2292 2293 // Count the number of live interals. 2294 MaxUsage = std::max(MaxUsage, OpenIntervals.size()); 2295 2296 DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << 2297 OpenIntervals.size() <<"\n"); 2298 2299 // Add the current instruction to the list of open intervals. 2300 OpenIntervals.insert(I); 2301 } 2302 2303 unsigned Invariant = LoopInvariants.size(); 2304 DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n"); 2305 DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n"); 2306 DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n"); 2307 2308 R.LoopInvariantRegs = Invariant; 2309 R.MaxLocalUsers = MaxUsage; 2310 return R; 2311} 2312 2313unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 2314 unsigned Cost = 0; 2315 2316 // For each block. 2317 for (Loop::block_iterator bb = TheLoop->block_begin(), 2318 be = TheLoop->block_end(); bb != be; ++bb) { 2319 unsigned BlockCost = 0; 2320 BasicBlock *BB = *bb; 2321 2322 // For each instruction in the old loop. 2323 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 2324 unsigned C = getInstructionCost(it, VF); 2325 Cost += C; 2326 DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << 2327 VF << " For instruction: "<< *it << "\n"); 2328 } 2329 2330 // We assume that if-converted blocks have a 50% chance of being executed. 2331 // When the code is scalar then some of the blocks are avoided due to CF. 2332 // When the code is vectorized we execute all code paths. 2333 if (Legal->blockNeedsPredication(*bb) && VF == 1) 2334 BlockCost /= 2; 2335 2336 Cost += BlockCost; 2337 } 2338 2339 return Cost; 2340} 2341 2342unsigned 2343LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 2344 assert(TTI && "Invalid vector target transformation info"); 2345 2346 // If we know that this instruction will remain uniform, check the cost of 2347 // the scalar version. 2348 if (Legal->isUniformAfterVectorization(I)) 2349 VF = 1; 2350 2351 Type *RetTy = I->getType(); 2352 Type *VectorTy = ToVectorTy(RetTy, VF); 2353 2354 // TODO: We need to estimate the cost of intrinsic calls. 2355 switch (I->getOpcode()) { 2356 case Instruction::GetElementPtr: 2357 // We mark this instruction as zero-cost because scalar GEPs are usually 2358 // lowered to the intruction addressing mode. At the moment we don't 2359 // generate vector geps. 2360 return 0; 2361 case Instruction::Br: { 2362 return TTI->getCFInstrCost(I->getOpcode()); 2363 } 2364 case Instruction::PHI: 2365 //TODO: IF-converted IFs become selects. 2366 return 0; 2367 case Instruction::Add: 2368 case Instruction::FAdd: 2369 case Instruction::Sub: 2370 case Instruction::FSub: 2371 case Instruction::Mul: 2372 case Instruction::FMul: 2373 case Instruction::UDiv: 2374 case Instruction::SDiv: 2375 case Instruction::FDiv: 2376 case Instruction::URem: 2377 case Instruction::SRem: 2378 case Instruction::FRem: 2379 case Instruction::Shl: 2380 case Instruction::LShr: 2381 case Instruction::AShr: 2382 case Instruction::And: 2383 case Instruction::Or: 2384 case Instruction::Xor: 2385 return TTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); 2386 case Instruction::Select: { 2387 SelectInst *SI = cast<SelectInst>(I); 2388 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 2389 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 2390 Type *CondTy = SI->getCondition()->getType(); 2391 if (ScalarCond) 2392 CondTy = VectorType::get(CondTy, VF); 2393 2394 return TTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 2395 } 2396 case Instruction::ICmp: 2397 case Instruction::FCmp: { 2398 Type *ValTy = I->getOperand(0)->getType(); 2399 VectorTy = ToVectorTy(ValTy, VF); 2400 return TTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); 2401 } 2402 case Instruction::Store: { 2403 StoreInst *SI = cast<StoreInst>(I); 2404 Type *ValTy = SI->getValueOperand()->getType(); 2405 VectorTy = ToVectorTy(ValTy, VF); 2406 2407 if (VF == 1) 2408 return TTI->getMemoryOpCost(I->getOpcode(), VectorTy, 2409 SI->getAlignment(), 2410 SI->getPointerAddressSpace()); 2411 2412 // Scalarized stores. 2413 int Stride = Legal->isConsecutivePtr(SI->getPointerOperand()); 2414 bool Reverse = Stride < 0; 2415 if (0 == Stride) { 2416 unsigned Cost = 0; 2417 2418 // The cost of extracting from the value vector and pointer vector. 2419 Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); 2420 for (unsigned i = 0; i < VF; ++i) { 2421 Cost += TTI->getVectorInstrCost(Instruction::ExtractElement, 2422 VectorTy, i); 2423 Cost += TTI->getVectorInstrCost(Instruction::ExtractElement, 2424 PtrTy, i); 2425 } 2426 2427 // The cost of the scalar stores. 2428 Cost += VF * TTI->getMemoryOpCost(I->getOpcode(), 2429 ValTy->getScalarType(), 2430 SI->getAlignment(), 2431 SI->getPointerAddressSpace()); 2432 return Cost; 2433 } 2434 2435 // Wide stores. 2436 unsigned Cost = TTI->getMemoryOpCost(I->getOpcode(), VectorTy, 2437 SI->getAlignment(), 2438 SI->getPointerAddressSpace()); 2439 if (Reverse) 2440 Cost += TTI->getShuffleCost(TargetTransformInfo::Reverse, 2441 VectorTy, 0); 2442 return Cost; 2443 } 2444 case Instruction::Load: { 2445 LoadInst *LI = cast<LoadInst>(I); 2446 2447 if (VF == 1) 2448 return TTI->getMemoryOpCost(I->getOpcode(), VectorTy, 2449 LI->getAlignment(), 2450 LI->getPointerAddressSpace()); 2451 2452 // Scalarized loads. 2453 int Stride = Legal->isConsecutivePtr(LI->getPointerOperand()); 2454 bool Reverse = Stride < 0; 2455 if (0 == Stride) { 2456 unsigned Cost = 0; 2457 Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); 2458 2459 // The cost of extracting from the pointer vector. 2460 for (unsigned i = 0; i < VF; ++i) 2461 Cost += TTI->getVectorInstrCost(Instruction::ExtractElement, 2462 PtrTy, i); 2463 2464 // The cost of inserting data to the result vector. 2465 for (unsigned i = 0; i < VF; ++i) 2466 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, 2467 VectorTy, i); 2468 2469 // The cost of the scalar stores. 2470 Cost += VF * TTI->getMemoryOpCost(I->getOpcode(), 2471 RetTy->getScalarType(), 2472 LI->getAlignment(), 2473 LI->getPointerAddressSpace()); 2474 return Cost; 2475 } 2476 2477 // Wide loads. 2478 unsigned Cost = TTI->getMemoryOpCost(I->getOpcode(), VectorTy, 2479 LI->getAlignment(), 2480 LI->getPointerAddressSpace()); 2481 if (Reverse) 2482 Cost += TTI->getShuffleCost(TargetTransformInfo::Reverse, 2483 VectorTy, 0); 2484 return Cost; 2485 } 2486 case Instruction::ZExt: 2487 case Instruction::SExt: 2488 case Instruction::FPToUI: 2489 case Instruction::FPToSI: 2490 case Instruction::FPExt: 2491 case Instruction::PtrToInt: 2492 case Instruction::IntToPtr: 2493 case Instruction::SIToFP: 2494 case Instruction::UIToFP: 2495 case Instruction::Trunc: 2496 case Instruction::FPTrunc: 2497 case Instruction::BitCast: { 2498 // We optimize the truncation of induction variable. 2499 // The cost of these is the same as the scalar operation. 2500 if (I->getOpcode() == Instruction::Trunc && 2501 Legal->isInductionVariable(I->getOperand(0))) 2502 return TTI->getCastInstrCost(I->getOpcode(), I->getType(), 2503 I->getOperand(0)->getType()); 2504 2505 Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); 2506 return TTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 2507 } 2508 case Instruction::Call: { 2509 assert(isTriviallyVectorizableIntrinsic(I)); 2510 IntrinsicInst *II = cast<IntrinsicInst>(I); 2511 Type *RetTy = ToVectorTy(II->getType(), VF); 2512 SmallVector<Type*, 4> Tys; 2513 for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) 2514 Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); 2515 return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); 2516 } 2517 default: { 2518 // We are scalarizing the instruction. Return the cost of the scalar 2519 // instruction, plus the cost of insert and extract into vector 2520 // elements, times the vector width. 2521 unsigned Cost = 0; 2522 2523 if (!RetTy->isVoidTy() && VF != 1) { 2524 unsigned InsCost = TTI->getVectorInstrCost(Instruction::InsertElement, 2525 VectorTy); 2526 unsigned ExtCost = TTI->getVectorInstrCost(Instruction::ExtractElement, 2527 VectorTy); 2528 2529 // The cost of inserting the results plus extracting each one of the 2530 // operands. 2531 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 2532 } 2533 2534 // The cost of executing VF copies of the scalar instruction. This opcode 2535 // is unknown. Assume that it is the same as 'mul'. 2536 Cost += VF * TTI->getArithmeticInstrCost(Instruction::Mul, VectorTy); 2537 return Cost; 2538 } 2539 }// end of switch. 2540} 2541 2542Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 2543 if (Scalar->isVoidTy() || VF == 1) 2544 return Scalar; 2545 return VectorType::get(Scalar, VF); 2546} 2547 2548char LoopVectorize::ID = 0; 2549static const char lv_name[] = "Loop Vectorization"; 2550INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 2551INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 2552INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 2553INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 2554INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 2555 2556namespace llvm { 2557 Pass *createLoopVectorizePass() { 2558 return new LoopVectorize(); 2559 } 2560} 2561 2562 2563