LoopVectorize.cpp revision 6c3074958370bf25dc6e4e4b757f0c083e245dbe
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9#include "LoopVectorize.h" 10#include "llvm/ADT/StringExtras.h" 11#include "llvm/Analysis/AliasAnalysis.h" 12#include "llvm/Analysis/AliasSetTracker.h" 13#include "llvm/Analysis/Dominators.h" 14#include "llvm/Analysis/LoopInfo.h" 15#include "llvm/Analysis/LoopIterator.h" 16#include "llvm/Analysis/LoopPass.h" 17#include "llvm/Analysis/ScalarEvolutionExpander.h" 18#include "llvm/Analysis/ScalarEvolutionExpressions.h" 19#include "llvm/Analysis/ValueTracking.h" 20#include "llvm/Analysis/Verifier.h" 21#include "llvm/Constants.h" 22#include "llvm/DataLayout.h" 23#include "llvm/DerivedTypes.h" 24#include "llvm/Function.h" 25#include "llvm/Instructions.h" 26#include "llvm/IntrinsicInst.h" 27#include "llvm/LLVMContext.h" 28#include "llvm/Module.h" 29#include "llvm/Pass.h" 30#include "llvm/Support/CommandLine.h" 31#include "llvm/Support/Debug.h" 32#include "llvm/Support/raw_ostream.h" 33#include "llvm/TargetTransformInfo.h" 34#include "llvm/Transforms/Scalar.h" 35#include "llvm/Transforms/Utils/BasicBlockUtils.h" 36#include "llvm/Transforms/Utils/Local.h" 37#include "llvm/Transforms/Vectorize.h" 38#include "llvm/Type.h" 39#include "llvm/Value.h" 40 41static cl::opt<unsigned> 42VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, 43 cl::desc("Sets the SIMD width. Zero is autoselect.")); 44 45static cl::opt<bool> 46EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, 47 cl::desc("Enable if-conversion during vectorization.")); 48 49namespace { 50 51/// The LoopVectorize Pass. 52struct LoopVectorize : public LoopPass { 53 /// Pass identification, replacement for typeid 54 static char ID; 55 56 explicit LoopVectorize() : LoopPass(ID) { 57 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 58 } 59 60 ScalarEvolution *SE; 61 DataLayout *DL; 62 LoopInfo *LI; 63 TargetTransformInfo *TTI; 64 DominatorTree *DT; 65 66 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { 67 // We only vectorize innermost loops. 68 if (!L->empty()) 69 return false; 70 71 SE = &getAnalysis<ScalarEvolution>(); 72 DL = getAnalysisIfAvailable<DataLayout>(); 73 LI = &getAnalysis<LoopInfo>(); 74 TTI = getAnalysisIfAvailable<TargetTransformInfo>(); 75 DT = &getAnalysis<DominatorTree>(); 76 77 DEBUG(dbgs() << "LV: Checking a loop in \"" << 78 L->getHeader()->getParent()->getName() << "\"\n"); 79 80 // Check if it is legal to vectorize the loop. 81 LoopVectorizationLegality LVL(L, SE, DL, DT); 82 if (!LVL.canVectorize()) { 83 DEBUG(dbgs() << "LV: Not vectorizing.\n"); 84 return false; 85 } 86 87 // Select the preffered vectorization factor. 88 const VectorTargetTransformInfo *VTTI = 0; 89 if (TTI) 90 VTTI = TTI->getVectorTargetTransformInfo(); 91 // Use the cost model. 92 LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); 93 94 // Check the function attribues to find out if this function should be 95 // optimized for size. 96 Function *F = L->getHeader()->getParent(); 97 Attribute::AttrKind SzAttr= Attribute::OptimizeForSize; 98 bool OptForSize = 99 F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, SzAttr); 100 101 unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); 102 103 if (VF == 1) { 104 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 105 return false; 106 } 107 108 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< 109 F->getParent()->getModuleIdentifier()<<"\n"); 110 111 // If we decided that it is *legal* to vectorizer the loop then do it. 112 InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF); 113 LB.vectorize(&LVL); 114 115 DEBUG(verifyFunction(*L->getHeader()->getParent())); 116 return true; 117 } 118 119 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 120 LoopPass::getAnalysisUsage(AU); 121 AU.addRequiredID(LoopSimplifyID); 122 AU.addRequiredID(LCSSAID); 123 AU.addRequired<LoopInfo>(); 124 AU.addRequired<ScalarEvolution>(); 125 AU.addRequired<DominatorTree>(); 126 AU.addPreserved<LoopInfo>(); 127 AU.addPreserved<DominatorTree>(); 128 } 129 130}; 131 132}// namespace 133 134//===----------------------------------------------------------------------===// 135// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 136// LoopVectorizationCostModel. 137//===----------------------------------------------------------------------===// 138 139void 140LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, 141 Loop *Lp, Value *Ptr) { 142 const SCEV *Sc = SE->getSCEV(Ptr); 143 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); 144 assert(AR && "Invalid addrec expression"); 145 const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); 146 const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); 147 Pointers.push_back(Ptr); 148 Starts.push_back(AR->getStart()); 149 Ends.push_back(ScEnd); 150} 151 152Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 153 // Save the current insertion location. 154 Instruction *Loc = Builder.GetInsertPoint(); 155 156 // We need to place the broadcast of invariant variables outside the loop. 157 Instruction *Instr = dyn_cast<Instruction>(V); 158 bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); 159 bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; 160 161 // Place the code for broadcasting invariant variables in the new preheader. 162 if (Invariant) 163 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 164 165 // Broadcast the scalar into all locations in the vector. 166 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 167 168 // Restore the builder insertion point. 169 if (Invariant) 170 Builder.SetInsertPoint(Loc); 171 172 return Shuf; 173} 174 175Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) { 176 assert(Val->getType()->isVectorTy() && "Must be a vector"); 177 assert(Val->getType()->getScalarType()->isIntegerTy() && 178 "Elem must be an integer"); 179 // Create the types. 180 Type *ITy = Val->getType()->getScalarType(); 181 VectorType *Ty = cast<VectorType>(Val->getType()); 182 int VLen = Ty->getNumElements(); 183 SmallVector<Constant*, 8> Indices; 184 185 // Create a vector of consecutive numbers from zero to VF. 186 for (int i = 0; i < VLen; ++i) 187 Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i )); 188 189 // Add the consecutive indices to the vector value. 190 Constant *Cv = ConstantVector::get(Indices); 191 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 192 return Builder.CreateAdd(Val, Cv, "induction"); 193} 194 195int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { 196 assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); 197 198 // If this value is a pointer induction variable we know it is consecutive. 199 PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); 200 if (Phi && Inductions.count(Phi)) { 201 InductionInfo II = Inductions[Phi]; 202 if (PtrInduction == II.IK) 203 return 1; 204 } 205 206 GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 207 if (!Gep) 208 return 0; 209 210 unsigned NumOperands = Gep->getNumOperands(); 211 Value *LastIndex = Gep->getOperand(NumOperands - 1); 212 213 // Check that all of the gep indices are uniform except for the last. 214 for (unsigned i = 0; i < NumOperands - 1; ++i) 215 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 216 return 0; 217 218 // We can emit wide load/stores only if the last index is the induction 219 // variable. 220 const SCEV *Last = SE->getSCEV(LastIndex); 221 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 222 const SCEV *Step = AR->getStepRecurrence(*SE); 223 224 // The memory is consecutive because the last index is consecutive 225 // and all other indices are loop invariant. 226 if (Step->isOne()) 227 return 1; 228 if (Step->isAllOnesValue()) 229 return -1; 230 } 231 232 return 0; 233} 234 235bool LoopVectorizationLegality::isUniform(Value *V) { 236 return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); 237} 238 239Value *InnerLoopVectorizer::getVectorValue(Value *V) { 240 assert(V != Induction && "The new induction variable should not be used."); 241 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 242 // If we saved a vectorized copy of V, use it. 243 Value *&MapEntry = WidenMap[V]; 244 if (MapEntry) 245 return MapEntry; 246 247 // Broadcast V and save the value for future uses. 248 Value *B = getBroadcastInstrs(V); 249 MapEntry = B; 250 return B; 251} 252 253Constant* 254InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { 255 return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); 256} 257 258Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 259 assert(Vec->getType()->isVectorTy() && "Invalid type"); 260 SmallVector<Constant*, 8> ShuffleMask; 261 for (unsigned i = 0; i < VF; ++i) 262 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 263 264 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 265 ConstantVector::get(ShuffleMask), 266 "reverse"); 267} 268 269void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 270 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 271 // Holds vector parameters or scalars, in case of uniform vals. 272 SmallVector<Value*, 8> Params; 273 274 // Find all of the vectorized parameters. 275 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 276 Value *SrcOp = Instr->getOperand(op); 277 278 // If we are accessing the old induction variable, use the new one. 279 if (SrcOp == OldInduction) { 280 Params.push_back(getVectorValue(SrcOp)); 281 continue; 282 } 283 284 // Try using previously calculated values. 285 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 286 287 // If the src is an instruction that appeared earlier in the basic block 288 // then it should already be vectorized. 289 if (SrcInst && OrigLoop->contains(SrcInst)) { 290 assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); 291 // The parameter is a vector value from earlier. 292 Params.push_back(WidenMap[SrcInst]); 293 } else { 294 // The parameter is a scalar from outside the loop. Maybe even a constant. 295 Params.push_back(SrcOp); 296 } 297 } 298 299 assert(Params.size() == Instr->getNumOperands() && 300 "Invalid number of operands"); 301 302 // Does this instruction return a value ? 303 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 304 Value *VecResults = 0; 305 306 // If we have a return value, create an empty vector. We place the scalarized 307 // instructions in this vector. 308 if (!IsVoidRetTy) 309 VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); 310 311 // For each scalar that we create: 312 for (unsigned i = 0; i < VF; ++i) { 313 Instruction *Cloned = Instr->clone(); 314 if (!IsVoidRetTy) 315 Cloned->setName(Instr->getName() + ".cloned"); 316 // Replace the operands of the cloned instrucions with extracted scalars. 317 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 318 Value *Op = Params[op]; 319 // Param is a vector. Need to extract the right lane. 320 if (Op->getType()->isVectorTy()) 321 Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); 322 Cloned->setOperand(op, Op); 323 } 324 325 // Place the cloned scalar in the new loop. 326 Builder.Insert(Cloned); 327 328 // If the original scalar returns a value we need to place it in a vector 329 // so that future users will be able to use it. 330 if (!IsVoidRetTy) 331 VecResults = Builder.CreateInsertElement(VecResults, Cloned, 332 Builder.getInt32(i)); 333 } 334 335 if (!IsVoidRetTy) 336 WidenMap[Instr] = VecResults; 337} 338 339Value* 340InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, 341 Instruction *Loc) { 342 LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = 343 Legal->getRuntimePointerCheck(); 344 345 if (!PtrRtCheck->Need) 346 return NULL; 347 348 Value *MemoryRuntimeCheck = 0; 349 unsigned NumPointers = PtrRtCheck->Pointers.size(); 350 SmallVector<Value* , 2> Starts; 351 SmallVector<Value* , 2> Ends; 352 353 SCEVExpander Exp(*SE, "induction"); 354 355 // Use this type for pointer arithmetic. 356 Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0); 357 358 for (unsigned i = 0; i < NumPointers; ++i) { 359 Value *Ptr = PtrRtCheck->Pointers[i]; 360 const SCEV *Sc = SE->getSCEV(Ptr); 361 362 if (SE->isLoopInvariant(Sc, OrigLoop)) { 363 DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << 364 *Ptr <<"\n"); 365 Starts.push_back(Ptr); 366 Ends.push_back(Ptr); 367 } else { 368 DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); 369 370 Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); 371 Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); 372 Starts.push_back(Start); 373 Ends.push_back(End); 374 } 375 } 376 377 for (unsigned i = 0; i < NumPointers; ++i) { 378 for (unsigned j = i+1; j < NumPointers; ++j) { 379 Instruction::CastOps Op = Instruction::BitCast; 380 Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc); 381 Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc); 382 Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc); 383 Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc); 384 385 Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 386 Start0, End1, "bound0", Loc); 387 Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 388 Start1, End0, "bound1", Loc); 389 Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, 390 "found.conflict", Loc); 391 if (MemoryRuntimeCheck) 392 MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, 393 MemoryRuntimeCheck, 394 IsConflict, 395 "conflict.rdx", Loc); 396 else 397 MemoryRuntimeCheck = IsConflict; 398 399 } 400 } 401 402 return MemoryRuntimeCheck; 403} 404 405void 406InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 407 /* 408 In this function we generate a new loop. The new loop will contain 409 the vectorized instructions while the old loop will continue to run the 410 scalar remainder. 411 412 [ ] <-- vector loop bypass. 413 / | 414 / v 415 | [ ] <-- vector pre header. 416 | | 417 | v 418 | [ ] \ 419 | [ ]_| <-- vector loop. 420 | | 421 \ v 422 >[ ] <--- middle-block. 423 / | 424 / v 425 | [ ] <--- new preheader. 426 | | 427 | v 428 | [ ] \ 429 | [ ]_| <-- old scalar loop to handle remainder. 430 \ | 431 \ v 432 >[ ] <-- exit block. 433 ... 434 */ 435 436 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 437 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 438 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 439 assert(ExitBlock && "Must have an exit block"); 440 441 // Some loops have a single integer induction variable, while other loops 442 // don't. One example is c++ iterators that often have multiple pointer 443 // induction variables. In the code below we also support a case where we 444 // don't have a single induction variable. 445 OldInduction = Legal->getInduction(); 446 Type *IdxTy = OldInduction ? OldInduction->getType() : 447 DL->getIntPtrType(SE->getContext()); 448 449 // Find the loop boundaries. 450 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); 451 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 452 453 // Get the total trip count from the count by adding 1. 454 ExitCount = SE->getAddExpr(ExitCount, 455 SE->getConstant(ExitCount->getType(), 1)); 456 457 // Expand the trip count and place the new instructions in the preheader. 458 // Notice that the pre-header does not change, only the loop body. 459 SCEVExpander Exp(*SE, "induction"); 460 461 // Count holds the overall loop count (N). 462 Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 463 BypassBlock->getTerminator()); 464 465 // The loop index does not have to start at Zero. Find the original start 466 // value from the induction PHI node. If we don't have an induction variable 467 // then we know that it starts at zero. 468 Value *StartIdx = OldInduction ? 469 OldInduction->getIncomingValueForBlock(BypassBlock): 470 ConstantInt::get(IdxTy, 0); 471 472 assert(BypassBlock && "Invalid loop structure"); 473 474 // Generate the code that checks in runtime if arrays overlap. 475 Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, 476 BypassBlock->getTerminator()); 477 478 // Split the single block loop into the two loop structure described above. 479 BasicBlock *VectorPH = 480 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 481 BasicBlock *VecBody = 482 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 483 BasicBlock *MiddleBlock = 484 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 485 BasicBlock *ScalarPH = 486 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 487 488 // This is the location in which we add all of the logic for bypassing 489 // the new vector loop. 490 Instruction *Loc = BypassBlock->getTerminator(); 491 492 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 493 // inside the loop. 494 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 495 496 // Generate the induction variable. 497 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 498 Constant *Step = ConstantInt::get(IdxTy, VF); 499 500 // We may need to extend the index in case there is a type mismatch. 501 // We know that the count starts at zero and does not overflow. 502 if (Count->getType() != IdxTy) { 503 // The exit count can be of pointer type. Convert it to the correct 504 // integer type. 505 if (ExitCount->getType()->isPointerTy()) 506 Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); 507 else 508 Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); 509 } 510 511 // Add the start index to the loop count to get the new end index. 512 Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); 513 514 // Now we need to generate the expression for N - (N % VF), which is 515 // the part that the vectorized body will execute. 516 Constant *CIVF = ConstantInt::get(IdxTy, VF); 517 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); 518 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 519 Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, 520 "end.idx.rnd.down", Loc); 521 522 // Now, compare the new count to zero. If it is zero skip the vector loop and 523 // jump to the scalar loop. 524 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 525 IdxEndRoundDown, 526 StartIdx, 527 "cmp.zero", Loc); 528 529 // If we are using memory runtime checks, include them in. 530 if (MemoryRuntimeCheck) 531 Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, 532 "CntOrMem", Loc); 533 534 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 535 // Remove the old terminator. 536 Loc->eraseFromParent(); 537 538 // We are going to resume the execution of the scalar loop. 539 // Go over all of the induction variables that we found and fix the 540 // PHIs that are left in the scalar version of the loop. 541 // The starting values of PHI nodes depend on the counter of the last 542 // iteration in the vectorized loop. 543 // If we come from a bypass edge then we need to start from the original 544 // start value. 545 546 // This variable saves the new starting index for the scalar loop. 547 PHINode *ResumeIndex = 0; 548 LoopVectorizationLegality::InductionList::iterator I, E; 549 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 550 for (I = List->begin(), E = List->end(); I != E; ++I) { 551 PHINode *OrigPhi = I->first; 552 LoopVectorizationLegality::InductionInfo II = I->second; 553 PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", 554 MiddleBlock->getTerminator()); 555 Value *EndValue = 0; 556 switch (II.IK) { 557 case LoopVectorizationLegality::NoInduction: 558 llvm_unreachable("Unknown induction"); 559 case LoopVectorizationLegality::IntInduction: { 560 // Handle the integer induction counter: 561 assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); 562 assert(OrigPhi == OldInduction && "Unknown integer PHI"); 563 // We know what the end value is. 564 EndValue = IdxEndRoundDown; 565 // We also know which PHI node holds it. 566 ResumeIndex = ResumeVal; 567 break; 568 } 569 case LoopVectorizationLegality::ReverseIntInduction: { 570 // Convert the CountRoundDown variable to the PHI size. 571 unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); 572 unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); 573 Value *CRD = CountRoundDown; 574 if (CRDSize > IISize) 575 CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, 576 II.StartValue->getType(), 577 "tr.crd", BypassBlock->getTerminator()); 578 else if (CRDSize < IISize) 579 CRD = CastInst::Create(Instruction::SExt, CountRoundDown, 580 II.StartValue->getType(), 581 "sext.crd", BypassBlock->getTerminator()); 582 // Handle reverse integer induction counter: 583 EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", 584 BypassBlock->getTerminator()); 585 break; 586 } 587 case LoopVectorizationLegality::PtrInduction: { 588 // For pointer induction variables, calculate the offset using 589 // the end index. 590 EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, 591 "ptr.ind.end", 592 BypassBlock->getTerminator()); 593 break; 594 } 595 }// end of case 596 597 // The new PHI merges the original incoming value, in case of a bypass, 598 // or the value at the end of the vectorized loop. 599 ResumeVal->addIncoming(II.StartValue, BypassBlock); 600 ResumeVal->addIncoming(EndValue, VecBody); 601 602 // Fix the scalar body counter (PHI node). 603 unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); 604 OrigPhi->setIncomingValue(BlockIdx, ResumeVal); 605 } 606 607 // If we are generating a new induction variable then we also need to 608 // generate the code that calculates the exit value. This value is not 609 // simply the end of the counter because we may skip the vectorized body 610 // in case of a runtime check. 611 if (!OldInduction){ 612 assert(!ResumeIndex && "Unexpected resume value found"); 613 ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", 614 MiddleBlock->getTerminator()); 615 ResumeIndex->addIncoming(StartIdx, BypassBlock); 616 ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); 617 } 618 619 // Make sure that we found the index where scalar loop needs to continue. 620 assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && 621 "Invalid resume Index"); 622 623 // Add a check in the middle block to see if we have completed 624 // all of the iterations in the first vector loop. 625 // If (N - N%VF) == N, then we *don't* need to run the remainder. 626 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 627 ResumeIndex, "cmp.n", 628 MiddleBlock->getTerminator()); 629 630 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 631 // Remove the old terminator. 632 MiddleBlock->getTerminator()->eraseFromParent(); 633 634 // Create i+1 and fill the PHINode. 635 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 636 Induction->addIncoming(StartIdx, VectorPH); 637 Induction->addIncoming(NextIdx, VecBody); 638 // Create the compare. 639 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 640 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 641 642 // Now we have two terminators. Remove the old one from the block. 643 VecBody->getTerminator()->eraseFromParent(); 644 645 // Get ready to start creating new instructions into the vectorized body. 646 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 647 648 // Create and register the new vector loop. 649 Loop* Lp = new Loop(); 650 Loop *ParentLoop = OrigLoop->getParentLoop(); 651 652 // Insert the new loop into the loop nest and register the new basic blocks. 653 if (ParentLoop) { 654 ParentLoop->addChildLoop(Lp); 655 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 656 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 657 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 658 } else { 659 LI->addTopLevelLoop(Lp); 660 } 661 662 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 663 664 // Save the state. 665 LoopVectorPreHeader = VectorPH; 666 LoopScalarPreHeader = ScalarPH; 667 LoopMiddleBlock = MiddleBlock; 668 LoopExitBlock = ExitBlock; 669 LoopVectorBody = VecBody; 670 LoopScalarBody = OldBasicBlock; 671 LoopBypassBlock = BypassBlock; 672} 673 674/// This function returns the identity element (or neutral element) for 675/// the operation K. 676static unsigned 677getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { 678 switch (K) { 679 case LoopVectorizationLegality::IntegerXor: 680 case LoopVectorizationLegality::IntegerAdd: 681 case LoopVectorizationLegality::IntegerOr: 682 // Adding, Xoring, Oring zero to a number does not change it. 683 return 0; 684 case LoopVectorizationLegality::IntegerMult: 685 // Multiplying a number by 1 does not change it. 686 return 1; 687 case LoopVectorizationLegality::IntegerAnd: 688 // AND-ing a number with an all-1 value does not change it. 689 return -1; 690 default: 691 llvm_unreachable("Unknown reduction kind"); 692 } 693} 694 695static bool 696isTriviallyVectorizableIntrinsic(Instruction *Inst) { 697 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); 698 if (!II) 699 return false; 700 switch (II->getIntrinsicID()) { 701 case Intrinsic::sqrt: 702 case Intrinsic::sin: 703 case Intrinsic::cos: 704 case Intrinsic::exp: 705 case Intrinsic::exp2: 706 case Intrinsic::log: 707 case Intrinsic::log10: 708 case Intrinsic::log2: 709 case Intrinsic::fabs: 710 case Intrinsic::floor: 711 case Intrinsic::ceil: 712 case Intrinsic::trunc: 713 case Intrinsic::rint: 714 case Intrinsic::nearbyint: 715 case Intrinsic::pow: 716 case Intrinsic::fma: 717 case Intrinsic::fmuladd: 718 return true; 719 default: 720 return false; 721 } 722 return false; 723} 724 725void 726InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { 727 //===------------------------------------------------===// 728 // 729 // Notice: any optimization or new instruction that go 730 // into the code below should be also be implemented in 731 // the cost-model. 732 // 733 //===------------------------------------------------===// 734 BasicBlock &BB = *OrigLoop->getHeader(); 735 Constant *Zero = 736 ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0); 737 738 // In order to support reduction variables we need to be able to vectorize 739 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 740 // stages. First, we create a new vector PHI node with no incoming edges. 741 // We use this value when we vectorize all of the instructions that use the 742 // PHI. Next, after all of the instructions in the block are complete we 743 // add the new incoming edges to the PHI. At this point all of the 744 // instructions in the basic block are vectorized, so we can use them to 745 // construct the PHI. 746 PhiVector RdxPHIsToFix; 747 748 // Scan the loop in a topological order to ensure that defs are vectorized 749 // before users. 750 LoopBlocksDFS DFS(OrigLoop); 751 DFS.perform(LI); 752 753 // Vectorize all of the blocks in the original loop. 754 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 755 be = DFS.endRPO(); bb != be; ++bb) 756 vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix); 757 758 // At this point every instruction in the original loop is widened to 759 // a vector form. We are almost done. Now, we need to fix the PHI nodes 760 // that we vectorized. The PHI nodes are currently empty because we did 761 // not want to introduce cycles. Notice that the remaining PHI nodes 762 // that we need to fix are reduction variables. 763 764 // Create the 'reduced' values for each of the induction vars. 765 // The reduced values are the vector values that we scalarize and combine 766 // after the loop is finished. 767 for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); 768 it != e; ++it) { 769 PHINode *RdxPhi = *it; 770 PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]); 771 assert(RdxPhi && "Unable to recover vectorized PHI"); 772 773 // Find the reduction variable descriptor. 774 assert(Legal->getReductionVars()->count(RdxPhi) && 775 "Unable to find the reduction variable"); 776 LoopVectorizationLegality::ReductionDescriptor RdxDesc = 777 (*Legal->getReductionVars())[RdxPhi]; 778 779 // We need to generate a reduction vector from the incoming scalar. 780 // To do so, we need to generate the 'identity' vector and overide 781 // one of the elements with the incoming scalar reduction. We need 782 // to do it in the vector-loop preheader. 783 Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); 784 785 // This is the vector-clone of the value that leaves the loop. 786 Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); 787 Type *VecTy = VectorExit->getType(); 788 789 // Find the reduction identity variable. Zero for addition, or, xor, 790 // one for multiplication, -1 for And. 791 Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), 792 VecTy->getScalarType()); 793 794 // This vector is the Identity vector where the first element is the 795 // incoming scalar reduction. 796 Value *VectorStart = Builder.CreateInsertElement(Identity, 797 RdxDesc.StartValue, Zero); 798 799 // Fix the vector-loop phi. 800 // We created the induction variable so we know that the 801 // preheader is the first entry. 802 BasicBlock *VecPreheader = Induction->getIncomingBlock(0); 803 804 // Reductions do not have to start at zero. They can start with 805 // any loop invariant values. 806 VecRdxPhi->addIncoming(VectorStart, VecPreheader); 807 Value *Val = 808 getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 809 VecRdxPhi->addIncoming(Val, LoopVectorBody); 810 811 // Before each round, move the insertion point right between 812 // the PHIs and the values we are going to write. 813 // This allows us to write both PHINodes and the extractelement 814 // instructions. 815 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); 816 817 // This PHINode contains the vectorized reduction variable, or 818 // the initial value vector, if we bypass the vector loop. 819 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); 820 NewPhi->addIncoming(VectorStart, LoopBypassBlock); 821 NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); 822 823 // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles 824 // and vector ops, reducing the set of values being computed by half each 825 // round. 826 assert(isPowerOf2_32(VF) && 827 "Reduction emission only supported for pow2 vectors!"); 828 Value *TmpVec = NewPhi; 829 SmallVector<Constant*, 32> ShuffleMask(VF, 0); 830 for (unsigned i = VF; i != 1; i >>= 1) { 831 // Move the upper half of the vector to the lower half. 832 for (unsigned j = 0; j != i/2; ++j) 833 ShuffleMask[j] = Builder.getInt32(i/2 + j); 834 835 // Fill the rest of the mask with undef. 836 std::fill(&ShuffleMask[i/2], ShuffleMask.end(), 837 UndefValue::get(Builder.getInt32Ty())); 838 839 Value *Shuf = 840 Builder.CreateShuffleVector(TmpVec, 841 UndefValue::get(TmpVec->getType()), 842 ConstantVector::get(ShuffleMask), 843 "rdx.shuf"); 844 845 // Emit the operation on the shuffled value. 846 switch (RdxDesc.Kind) { 847 case LoopVectorizationLegality::IntegerAdd: 848 TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx"); 849 break; 850 case LoopVectorizationLegality::IntegerMult: 851 TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx"); 852 break; 853 case LoopVectorizationLegality::IntegerOr: 854 TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx"); 855 break; 856 case LoopVectorizationLegality::IntegerAnd: 857 TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx"); 858 break; 859 case LoopVectorizationLegality::IntegerXor: 860 TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx"); 861 break; 862 default: 863 llvm_unreachable("Unknown reduction operation"); 864 } 865 } 866 867 // The result is in the first element of the vector. 868 Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); 869 870 // Now, we need to fix the users of the reduction variable 871 // inside and outside of the scalar remainder loop. 872 // We know that the loop is in LCSSA form. We need to update the 873 // PHI nodes in the exit blocks. 874 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 875 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 876 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 877 if (!LCSSAPhi) continue; 878 879 // All PHINodes need to have a single entry edge, or two if 880 // we already fixed them. 881 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 882 883 // We found our reduction value exit-PHI. Update it with the 884 // incoming bypass edge. 885 if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { 886 // Add an edge coming from the bypass. 887 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); 888 break; 889 } 890 }// end of the LCSSA phi scan. 891 892 // Fix the scalar loop reduction variable with the incoming reduction sum 893 // from the vector body and from the backedge value. 894 int IncomingEdgeBlockIdx = 895 (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); 896 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 897 // Pick the other block. 898 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 899 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); 900 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); 901 }// end of for each redux variable. 902 903 // The Loop exit block may have single value PHI nodes where the incoming 904 // value is 'undef'. While vectorizing we only handled real values that 905 // were defined inside the loop. Here we handle the 'undef case'. 906 // See PR14725. 907 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 908 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 909 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 910 if (!LCSSAPhi) continue; 911 if (LCSSAPhi->getNumIncomingValues() == 1) 912 LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), 913 LoopMiddleBlock); 914 } 915} 916 917Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 918 assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && 919 "Invalid edge"); 920 921 Value *SrcMask = createBlockInMask(Src); 922 923 // The terminator has to be a branch inst! 924 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 925 assert(BI && "Unexpected terminator found"); 926 927 Value *EdgeMask = SrcMask; 928 if (BI->isConditional()) { 929 EdgeMask = getVectorValue(BI->getCondition()); 930 if (BI->getSuccessor(0) != Dst) 931 EdgeMask = Builder.CreateNot(EdgeMask); 932 } 933 934 return Builder.CreateAnd(EdgeMask, SrcMask); 935} 936 937Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { 938 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 939 940 // Loop incoming mask is all-one. 941 if (OrigLoop->getHeader() == BB) { 942 Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); 943 return getVectorValue(C); 944 } 945 946 // This is the block mask. We OR all incoming edges, and with zero. 947 Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); 948 Value *BlockMask = getVectorValue(Zero); 949 950 // For each pred: 951 for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) 952 BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB)); 953 954 return BlockMask; 955} 956 957void 958InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, 959 BasicBlock *BB, PhiVector *PV) { 960 Constant *Zero = Builder.getInt32(0); 961 962 // For each instruction in the old loop. 963 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 964 switch (it->getOpcode()) { 965 case Instruction::Br: 966 // Nothing to do for PHIs and BR, since we already took care of the 967 // loop control flow instructions. 968 continue; 969 case Instruction::PHI:{ 970 PHINode* P = cast<PHINode>(it); 971 // Handle reduction variables: 972 if (Legal->getReductionVars()->count(P)) { 973 // This is phase one of vectorizing PHIs. 974 Type *VecTy = VectorType::get(it->getType(), VF); 975 WidenMap[it] = 976 PHINode::Create(VecTy, 2, "vec.phi", 977 LoopVectorBody->getFirstInsertionPt()); 978 PV->push_back(P); 979 continue; 980 } 981 982 // Check for PHI nodes that are lowered to vector selects. 983 if (P->getParent() != OrigLoop->getHeader()) { 984 // We know that all PHIs in non header blocks are converted into 985 // selects, so we don't have to worry about the insertion order and we 986 // can just use the builder. 987 988 // At this point we generate the predication tree. There may be 989 // duplications since this is a simple recursive scan, but future 990 // optimizations will clean it up. 991 Value *Cond = createEdgeMask(P->getIncomingBlock(0), P->getParent()); 992 WidenMap[P] = 993 Builder.CreateSelect(Cond, 994 getVectorValue(P->getIncomingValue(0)), 995 getVectorValue(P->getIncomingValue(1)), 996 "predphi"); 997 continue; 998 } 999 1000 // This PHINode must be an induction variable. 1001 // Make sure that we know about it. 1002 assert(Legal->getInductionVars()->count(P) && 1003 "Not an induction variable"); 1004 1005 LoopVectorizationLegality::InductionInfo II = 1006 Legal->getInductionVars()->lookup(P); 1007 1008 switch (II.IK) { 1009 case LoopVectorizationLegality::NoInduction: 1010 llvm_unreachable("Unknown induction"); 1011 case LoopVectorizationLegality::IntInduction: { 1012 assert(P == OldInduction && "Unexpected PHI"); 1013 Value *Broadcasted = getBroadcastInstrs(Induction); 1014 // After broadcasting the induction variable we need to make the 1015 // vector consecutive by adding 0, 1, 2 ... 1016 Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); 1017 WidenMap[OldInduction] = ConsecutiveInduction; 1018 continue; 1019 } 1020 case LoopVectorizationLegality::ReverseIntInduction: 1021 case LoopVectorizationLegality::PtrInduction: 1022 // Handle reverse integer and pointer inductions. 1023 Value *StartIdx = 0; 1024 // If we have a single integer induction variable then use it. 1025 // Otherwise, start counting at zero. 1026 if (OldInduction) { 1027 LoopVectorizationLegality::InductionInfo OldII = 1028 Legal->getInductionVars()->lookup(OldInduction); 1029 StartIdx = OldII.StartValue; 1030 } else { 1031 StartIdx = ConstantInt::get(Induction->getType(), 0); 1032 } 1033 // This is the normalized GEP that starts counting at zero. 1034 Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, 1035 "normalized.idx"); 1036 1037 // Handle the reverse integer induction variable case. 1038 if (LoopVectorizationLegality::ReverseIntInduction == II.IK) { 1039 IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); 1040 Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, 1041 "resize.norm.idx"); 1042 Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, 1043 "reverse.idx"); 1044 1045 // This is a new value so do not hoist it out. 1046 Value *Broadcasted = getBroadcastInstrs(ReverseInd); 1047 // After broadcasting the induction variable we need to make the 1048 // vector consecutive by adding ... -3, -2, -1, 0. 1049 Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted, 1050 true); 1051 WidenMap[it] = ConsecutiveInduction; 1052 continue; 1053 } 1054 1055 // Handle the pointer induction variable case. 1056 assert(P->getType()->isPointerTy() && "Unexpected type."); 1057 1058 // This is the vector of results. Notice that we don't generate 1059 // vector geps because scalar geps result in better code. 1060 Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); 1061 for (unsigned int i = 0; i < VF; ++i) { 1062 Constant *Idx = ConstantInt::get(Induction->getType(), i); 1063 Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, 1064 "gep.idx"); 1065 Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, 1066 "next.gep"); 1067 VecVal = Builder.CreateInsertElement(VecVal, SclrGep, 1068 Builder.getInt32(i), 1069 "insert.gep"); 1070 } 1071 1072 WidenMap[it] = VecVal; 1073 continue; 1074 } 1075 1076 }// End of PHI. 1077 1078 case Instruction::Add: 1079 case Instruction::FAdd: 1080 case Instruction::Sub: 1081 case Instruction::FSub: 1082 case Instruction::Mul: 1083 case Instruction::FMul: 1084 case Instruction::UDiv: 1085 case Instruction::SDiv: 1086 case Instruction::FDiv: 1087 case Instruction::URem: 1088 case Instruction::SRem: 1089 case Instruction::FRem: 1090 case Instruction::Shl: 1091 case Instruction::LShr: 1092 case Instruction::AShr: 1093 case Instruction::And: 1094 case Instruction::Or: 1095 case Instruction::Xor: { 1096 // Just widen binops. 1097 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); 1098 Value *A = getVectorValue(it->getOperand(0)); 1099 Value *B = getVectorValue(it->getOperand(1)); 1100 1101 // Use this vector value for all users of the original instruction. 1102 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); 1103 WidenMap[it] = V; 1104 1105 // Update the NSW, NUW and Exact flags. 1106 BinaryOperator *VecOp = cast<BinaryOperator>(V); 1107 if (isa<OverflowingBinaryOperator>(BinOp)) { 1108 VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); 1109 VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); 1110 } 1111 if (isa<PossiblyExactOperator>(VecOp)) 1112 VecOp->setIsExact(BinOp->isExact()); 1113 break; 1114 } 1115 case Instruction::Select: { 1116 // Widen selects. 1117 // If the selector is loop invariant we can create a select 1118 // instruction with a scalar condition. Otherwise, use vector-select. 1119 Value *Cond = it->getOperand(0); 1120 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); 1121 1122 // The condition can be loop invariant but still defined inside the 1123 // loop. This means that we can't just use the original 'cond' value. 1124 // We have to take the 'vectorized' value and pick the first lane. 1125 // Instcombine will make this a no-op. 1126 Cond = getVectorValue(Cond); 1127 if (InvariantCond) 1128 Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); 1129 1130 Value *Op0 = getVectorValue(it->getOperand(1)); 1131 Value *Op1 = getVectorValue(it->getOperand(2)); 1132 WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1); 1133 break; 1134 } 1135 1136 case Instruction::ICmp: 1137 case Instruction::FCmp: { 1138 // Widen compares. Generate vector compares. 1139 bool FCmp = (it->getOpcode() == Instruction::FCmp); 1140 CmpInst *Cmp = dyn_cast<CmpInst>(it); 1141 Value *A = getVectorValue(it->getOperand(0)); 1142 Value *B = getVectorValue(it->getOperand(1)); 1143 if (FCmp) 1144 WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 1145 else 1146 WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B); 1147 break; 1148 } 1149 1150 case Instruction::Store: { 1151 // Attempt to issue a wide store. 1152 StoreInst *SI = dyn_cast<StoreInst>(it); 1153 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); 1154 Value *Ptr = SI->getPointerOperand(); 1155 unsigned Alignment = SI->getAlignment(); 1156 1157 assert(!Legal->isUniform(Ptr) && 1158 "We do not allow storing to uniform addresses"); 1159 1160 1161 int Stride = Legal->isConsecutivePtr(Ptr); 1162 bool Reverse = Stride < 0; 1163 if (Stride == 0) { 1164 scalarizeInstruction(it); 1165 break; 1166 } 1167 1168 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1169 if (Gep) { 1170 // The last index does not have to be the induction. It can be 1171 // consecutive and be a function of the index. For example A[I+1]; 1172 unsigned NumOperands = Gep->getNumOperands(); 1173 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); 1174 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1175 1176 // Create the new GEP with the new induction variable. 1177 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1178 Gep2->setOperand(NumOperands - 1, LastIndex); 1179 Ptr = Builder.Insert(Gep2); 1180 } else { 1181 // Use the induction element ptr. 1182 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1183 Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); 1184 } 1185 1186 // If the address is consecutive but reversed, then the 1187 // wide load needs to start at the last vector element. 1188 if (Reverse) 1189 Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF)); 1190 1191 Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); 1192 Value *Val = getVectorValue(SI->getValueOperand()); 1193 if (Reverse) 1194 Val = reverseVector(Val); 1195 Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); 1196 break; 1197 } 1198 case Instruction::Load: { 1199 // Attempt to issue a wide load. 1200 LoadInst *LI = dyn_cast<LoadInst>(it); 1201 Type *RetTy = VectorType::get(LI->getType(), VF); 1202 Value *Ptr = LI->getPointerOperand(); 1203 unsigned Alignment = LI->getAlignment(); 1204 1205 // If the pointer is loop invariant or if it is non consecutive, 1206 // scalarize the load. 1207 int Stride = Legal->isConsecutivePtr(Ptr); 1208 bool Reverse = Stride < 0; 1209 if (Legal->isUniform(Ptr) || Stride == 0) { 1210 scalarizeInstruction(it); 1211 break; 1212 } 1213 1214 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1215 if (Gep) { 1216 // The last index does not have to be the induction. It can be 1217 // consecutive and be a function of the index. For example A[I+1]; 1218 unsigned NumOperands = Gep->getNumOperands(); 1219 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); 1220 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1221 1222 // Create the new GEP with the new induction variable. 1223 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1224 Gep2->setOperand(NumOperands - 1, LastIndex); 1225 Ptr = Builder.Insert(Gep2); 1226 } else { 1227 // Use the induction element ptr. 1228 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1229 Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); 1230 } 1231 // If the address is consecutive but reversed, then the 1232 // wide load needs to start at the last vector element. 1233 if (Reverse) 1234 Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF)); 1235 1236 Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); 1237 LI = Builder.CreateLoad(Ptr); 1238 LI->setAlignment(Alignment); 1239 1240 // Use this vector value for all users of the load. 1241 WidenMap[it] = Reverse ? reverseVector(LI) : LI; 1242 break; 1243 } 1244 case Instruction::ZExt: 1245 case Instruction::SExt: 1246 case Instruction::FPToUI: 1247 case Instruction::FPToSI: 1248 case Instruction::FPExt: 1249 case Instruction::PtrToInt: 1250 case Instruction::IntToPtr: 1251 case Instruction::SIToFP: 1252 case Instruction::UIToFP: 1253 case Instruction::Trunc: 1254 case Instruction::FPTrunc: 1255 case Instruction::BitCast: { 1256 CastInst *CI = dyn_cast<CastInst>(it); 1257 /// Optimize the special case where the source is the induction 1258 /// variable. Notice that we can only optimize the 'trunc' case 1259 /// because: a. FP conversions lose precision, b. sext/zext may wrap, 1260 /// c. other casts depend on pointer size. 1261 if (CI->getOperand(0) == OldInduction && 1262 it->getOpcode() == Instruction::Trunc) { 1263 Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, 1264 CI->getType()); 1265 Value *Broadcasted = getBroadcastInstrs(ScalarCast); 1266 WidenMap[it] = getConsecutiveVector(Broadcasted); 1267 break; 1268 } 1269 /// Vectorize casts. 1270 Value *A = getVectorValue(it->getOperand(0)); 1271 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 1272 WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy); 1273 break; 1274 } 1275 1276 case Instruction::Call: { 1277 assert(isTriviallyVectorizableIntrinsic(it)); 1278 Module *M = BB->getParent()->getParent(); 1279 IntrinsicInst *II = cast<IntrinsicInst>(it); 1280 Intrinsic::ID ID = II->getIntrinsicID(); 1281 SmallVector<Value*, 4> Args; 1282 for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) 1283 Args.push_back(getVectorValue(II->getArgOperand(i))); 1284 Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; 1285 Function *F = Intrinsic::getDeclaration(M, ID, Tys); 1286 WidenMap[it] = Builder.CreateCall(F, Args); 1287 break; 1288 } 1289 1290 default: 1291 // All other instructions are unsupported. Scalarize them. 1292 scalarizeInstruction(it); 1293 break; 1294 }// end of switch. 1295 }// end of for_each instr. 1296} 1297 1298void InnerLoopVectorizer::updateAnalysis() { 1299 // Forget the original basic block. 1300 SE->forgetLoop(OrigLoop); 1301 1302 // Update the dominator tree information. 1303 assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && 1304 "Entry does not dominate exit."); 1305 1306 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); 1307 DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); 1308 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); 1309 DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); 1310 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 1311 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 1312 1313 DEBUG(DT->verifyAnalysis()); 1314} 1315 1316bool LoopVectorizationLegality::canVectorizeWithIfConvert() { 1317 if (!EnableIfConversion) 1318 return false; 1319 1320 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); 1321 std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); 1322 1323 // Collect the blocks that need predication. 1324 for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { 1325 BasicBlock *BB = LoopBlocks[i]; 1326 1327 // We don't support switch statements inside loops. 1328 if (!isa<BranchInst>(BB->getTerminator())) 1329 return false; 1330 1331 // We must have at most two predecessors because we need to convert 1332 // all PHIs to selects. 1333 unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); 1334 if (Preds > 2) 1335 return false; 1336 1337 // We must be able to predicate all blocks that need to be predicated. 1338 if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) 1339 return false; 1340 } 1341 1342 // We can if-convert this loop. 1343 return true; 1344} 1345 1346bool LoopVectorizationLegality::canVectorize() { 1347 assert(TheLoop->getLoopPreheader() && "No preheader!!"); 1348 1349 // We can only vectorize innermost loops. 1350 if (TheLoop->getSubLoopsVector().size()) 1351 return false; 1352 1353 // We must have a single backedge. 1354 if (TheLoop->getNumBackEdges() != 1) 1355 return false; 1356 1357 // We must have a single exiting block. 1358 if (!TheLoop->getExitingBlock()) 1359 return false; 1360 1361 unsigned NumBlocks = TheLoop->getNumBlocks(); 1362 1363 // Check if we can if-convert non single-bb loops. 1364 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { 1365 DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); 1366 return false; 1367 } 1368 1369 // We need to have a loop header. 1370 BasicBlock *Latch = TheLoop->getLoopLatch(); 1371 DEBUG(dbgs() << "LV: Found a loop: " << 1372 TheLoop->getHeader()->getName() << "\n"); 1373 1374 // ScalarEvolution needs to be able to find the exit count. 1375 const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); 1376 if (ExitCount == SE->getCouldNotCompute()) { 1377 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 1378 return false; 1379 } 1380 1381 // Do not loop-vectorize loops with a tiny trip count. 1382 unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); 1383 if (TC > 0u && TC < TinyTripCountThreshold) { 1384 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << 1385 "This loop is not worth vectorizing.\n"); 1386 return false; 1387 } 1388 1389 // Check if we can vectorize the instructions and CFG in this loop. 1390 if (!canVectorizeInstrs()) { 1391 DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); 1392 return false; 1393 } 1394 1395 // Go over each instruction and look at memory deps. 1396 if (!canVectorizeMemory()) { 1397 DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); 1398 return false; 1399 } 1400 1401 // Collect all of the variables that remain uniform after vectorization. 1402 collectLoopUniforms(); 1403 1404 DEBUG(dbgs() << "LV: We can vectorize this loop" << 1405 (PtrRtCheck.Need ? " (with a runtime bound check)" : "") 1406 <<"!\n"); 1407 1408 // Okay! We can vectorize. At this point we don't have any other mem analysis 1409 // which may limit our maximum vectorization factor, so just return true with 1410 // no restrictions. 1411 return true; 1412} 1413 1414bool LoopVectorizationLegality::canVectorizeInstrs() { 1415 BasicBlock *PreHeader = TheLoop->getLoopPreheader(); 1416 BasicBlock *Header = TheLoop->getHeader(); 1417 1418 // For each block in the loop. 1419 for (Loop::block_iterator bb = TheLoop->block_begin(), 1420 be = TheLoop->block_end(); bb != be; ++bb) { 1421 1422 // Scan the instructions in the block and look for hazards. 1423 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 1424 ++it) { 1425 1426 if (PHINode *Phi = dyn_cast<PHINode>(it)) { 1427 // This should not happen because the loop should be normalized. 1428 if (Phi->getNumIncomingValues() != 2) { 1429 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 1430 return false; 1431 } 1432 1433 // Check that this PHI type is allowed. 1434 if (!Phi->getType()->isIntegerTy() && 1435 !Phi->getType()->isPointerTy()) { 1436 DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); 1437 return false; 1438 } 1439 1440 // If this PHINode is not in the header block, then we know that we 1441 // can convert it to select during if-conversion. No need to check if 1442 // the PHIs in this block are induction or reduction variables. 1443 if (*bb != Header) 1444 continue; 1445 1446 // This is the value coming from the preheader. 1447 Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); 1448 // Check if this is an induction variable. 1449 InductionKind IK = isInductionVariable(Phi); 1450 1451 if (NoInduction != IK) { 1452 // Int inductions are special because we only allow one IV. 1453 if (IK == IntInduction) { 1454 if (Induction) { 1455 DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); 1456 return false; 1457 } 1458 Induction = Phi; 1459 } 1460 1461 DEBUG(dbgs() << "LV: Found an induction variable.\n"); 1462 Inductions[Phi] = InductionInfo(StartValue, IK); 1463 continue; 1464 } 1465 1466 if (AddReductionVar(Phi, IntegerAdd)) { 1467 DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); 1468 continue; 1469 } 1470 if (AddReductionVar(Phi, IntegerMult)) { 1471 DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); 1472 continue; 1473 } 1474 if (AddReductionVar(Phi, IntegerOr)) { 1475 DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); 1476 continue; 1477 } 1478 if (AddReductionVar(Phi, IntegerAnd)) { 1479 DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); 1480 continue; 1481 } 1482 if (AddReductionVar(Phi, IntegerXor)) { 1483 DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); 1484 continue; 1485 } 1486 1487 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 1488 return false; 1489 }// end of PHI handling 1490 1491 // We still don't handle functions. 1492 CallInst *CI = dyn_cast<CallInst>(it); 1493 if (CI && !isTriviallyVectorizableIntrinsic(it)) { 1494 DEBUG(dbgs() << "LV: Found a call site.\n"); 1495 return false; 1496 } 1497 1498 // Check that the instruction return type is vectorizable. 1499 if (!VectorType::isValidElementType(it->getType()) && 1500 !it->getType()->isVoidTy()) { 1501 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); 1502 return false; 1503 } 1504 1505 // Check that the stored type is vectorizable. 1506 if (StoreInst *ST = dyn_cast<StoreInst>(it)) { 1507 Type *T = ST->getValueOperand()->getType(); 1508 if (!VectorType::isValidElementType(T)) 1509 return false; 1510 } 1511 1512 // Reduction instructions are allowed to have exit users. 1513 // All other instructions must not have external users. 1514 if (!AllowedExit.count(it)) 1515 //Check that all of the users of the loop are inside the BB. 1516 for (Value::use_iterator I = it->use_begin(), E = it->use_end(); 1517 I != E; ++I) { 1518 Instruction *U = cast<Instruction>(*I); 1519 // This user may be a reduction exit value. 1520 if (!TheLoop->contains(U)) { 1521 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); 1522 return false; 1523 } 1524 } 1525 } // next instr. 1526 1527 } 1528 1529 if (!Induction) { 1530 DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); 1531 assert(getInductionVars()->size() && "No induction variables"); 1532 } 1533 1534 return true; 1535} 1536 1537void LoopVectorizationLegality::collectLoopUniforms() { 1538 // We now know that the loop is vectorizable! 1539 // Collect variables that will remain uniform after vectorization. 1540 std::vector<Value*> Worklist; 1541 BasicBlock *Latch = TheLoop->getLoopLatch(); 1542 1543 // Start with the conditional branch and walk up the block. 1544 Worklist.push_back(Latch->getTerminator()->getOperand(0)); 1545 1546 while (Worklist.size()) { 1547 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 1548 Worklist.pop_back(); 1549 1550 // Look at instructions inside this loop. 1551 // Stop when reaching PHI nodes. 1552 // TODO: we need to follow values all over the loop, not only in this block. 1553 if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) 1554 continue; 1555 1556 // This is a known uniform. 1557 Uniforms.insert(I); 1558 1559 // Insert all operands. 1560 for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { 1561 Worklist.push_back(I->getOperand(i)); 1562 } 1563 } 1564} 1565 1566bool LoopVectorizationLegality::canVectorizeMemory() { 1567 typedef SmallVector<Value*, 16> ValueVector; 1568 typedef SmallPtrSet<Value*, 16> ValueSet; 1569 // Holds the Load and Store *instructions*. 1570 ValueVector Loads; 1571 ValueVector Stores; 1572 PtrRtCheck.Pointers.clear(); 1573 PtrRtCheck.Need = false; 1574 1575 // For each block. 1576 for (Loop::block_iterator bb = TheLoop->block_begin(), 1577 be = TheLoop->block_end(); bb != be; ++bb) { 1578 1579 // Scan the BB and collect legal loads and stores. 1580 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 1581 ++it) { 1582 1583 // If this is a load, save it. If this instruction can read from memory 1584 // but is not a load, then we quit. Notice that we don't handle function 1585 // calls that read or write. 1586 if (it->mayReadFromMemory()) { 1587 LoadInst *Ld = dyn_cast<LoadInst>(it); 1588 if (!Ld) return false; 1589 if (!Ld->isSimple()) { 1590 DEBUG(dbgs() << "LV: Found a non-simple load.\n"); 1591 return false; 1592 } 1593 Loads.push_back(Ld); 1594 continue; 1595 } 1596 1597 // Save 'store' instructions. Abort if other instructions write to memory. 1598 if (it->mayWriteToMemory()) { 1599 StoreInst *St = dyn_cast<StoreInst>(it); 1600 if (!St) return false; 1601 if (!St->isSimple()) { 1602 DEBUG(dbgs() << "LV: Found a non-simple store.\n"); 1603 return false; 1604 } 1605 Stores.push_back(St); 1606 } 1607 } // next instr. 1608 } // next block. 1609 1610 // Now we have two lists that hold the loads and the stores. 1611 // Next, we find the pointers that they use. 1612 1613 // Check if we see any stores. If there are no stores, then we don't 1614 // care if the pointers are *restrict*. 1615 if (!Stores.size()) { 1616 DEBUG(dbgs() << "LV: Found a read-only loop!\n"); 1617 return true; 1618 } 1619 1620 // Holds the read and read-write *pointers* that we find. 1621 ValueVector Reads; 1622 ValueVector ReadWrites; 1623 1624 // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects 1625 // multiple times on the same object. If the ptr is accessed twice, once 1626 // for read and once for write, it will only appear once (on the write 1627 // list). This is okay, since we are going to check for conflicts between 1628 // writes and between reads and writes, but not between reads and reads. 1629 ValueSet Seen; 1630 1631 ValueVector::iterator I, IE; 1632 for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { 1633 StoreInst *ST = cast<StoreInst>(*I); 1634 Value* Ptr = ST->getPointerOperand(); 1635 1636 if (isUniform(Ptr)) { 1637 DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); 1638 return false; 1639 } 1640 1641 // If we did *not* see this pointer before, insert it to 1642 // the read-write list. At this phase it is only a 'write' list. 1643 if (Seen.insert(Ptr)) 1644 ReadWrites.push_back(Ptr); 1645 } 1646 1647 for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { 1648 LoadInst *LD = cast<LoadInst>(*I); 1649 Value* Ptr = LD->getPointerOperand(); 1650 // If we did *not* see this pointer before, insert it to the 1651 // read list. If we *did* see it before, then it is already in 1652 // the read-write list. This allows us to vectorize expressions 1653 // such as A[i] += x; Because the address of A[i] is a read-write 1654 // pointer. This only works if the index of A[i] is consecutive. 1655 // If the address of i is unknown (for example A[B[i]]) then we may 1656 // read a few words, modify, and write a few words, and some of the 1657 // words may be written to the same address. 1658 if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) 1659 Reads.push_back(Ptr); 1660 } 1661 1662 // If we write (or read-write) to a single destination and there are no 1663 // other reads in this loop then is it safe to vectorize. 1664 if (ReadWrites.size() == 1 && Reads.size() == 0) { 1665 DEBUG(dbgs() << "LV: Found a write-only loop!\n"); 1666 return true; 1667 } 1668 1669 // Find pointers with computable bounds. We are going to use this information 1670 // to place a runtime bound check. 1671 bool CanDoRT = true; 1672 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) 1673 if (hasComputableBounds(*I)) { 1674 PtrRtCheck.insert(SE, TheLoop, *I); 1675 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 1676 } else { 1677 CanDoRT = false; 1678 break; 1679 } 1680 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) 1681 if (hasComputableBounds(*I)) { 1682 PtrRtCheck.insert(SE, TheLoop, *I); 1683 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 1684 } else { 1685 CanDoRT = false; 1686 break; 1687 } 1688 1689 // Check that we did not collect too many pointers or found a 1690 // unsizeable pointer. 1691 if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { 1692 PtrRtCheck.reset(); 1693 CanDoRT = false; 1694 } 1695 1696 if (CanDoRT) { 1697 DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); 1698 } 1699 1700 bool NeedRTCheck = false; 1701 1702 // Now that the pointers are in two lists (Reads and ReadWrites), we 1703 // can check that there are no conflicts between each of the writes and 1704 // between the writes to the reads. 1705 ValueSet WriteObjects; 1706 ValueVector TempObjects; 1707 1708 // Check that the read-writes do not conflict with other read-write 1709 // pointers. 1710 bool AllWritesIdentified = true; 1711 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { 1712 GetUnderlyingObjects(*I, TempObjects, DL); 1713 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1714 it != e; ++it) { 1715 if (!isIdentifiedObject(*it)) { 1716 DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); 1717 NeedRTCheck = true; 1718 AllWritesIdentified = false; 1719 } 1720 if (!WriteObjects.insert(*it)) { 1721 DEBUG(dbgs() << "LV: Found a possible write-write reorder:" 1722 << **it <<"\n"); 1723 return false; 1724 } 1725 } 1726 TempObjects.clear(); 1727 } 1728 1729 /// Check that the reads don't conflict with the read-writes. 1730 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { 1731 GetUnderlyingObjects(*I, TempObjects, DL); 1732 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1733 it != e; ++it) { 1734 // If all of the writes are identified then we don't care if the read 1735 // pointer is identified or not. 1736 if (!AllWritesIdentified && !isIdentifiedObject(*it)) { 1737 DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); 1738 NeedRTCheck = true; 1739 } 1740 if (WriteObjects.count(*it)) { 1741 DEBUG(dbgs() << "LV: Found a possible read/write reorder:" 1742 << **it <<"\n"); 1743 return false; 1744 } 1745 } 1746 TempObjects.clear(); 1747 } 1748 1749 PtrRtCheck.Need = NeedRTCheck; 1750 if (NeedRTCheck && !CanDoRT) { 1751 DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << 1752 "the array bounds.\n"); 1753 PtrRtCheck.reset(); 1754 return false; 1755 } 1756 1757 DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") << 1758 " need a runtime memory check.\n"); 1759 return true; 1760} 1761 1762bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, 1763 ReductionKind Kind) { 1764 if (Phi->getNumIncomingValues() != 2) 1765 return false; 1766 1767 // Reduction variables are only found in the loop header block. 1768 if (Phi->getParent() != TheLoop->getHeader()) 1769 return false; 1770 1771 // Obtain the reduction start value from the value that comes from the loop 1772 // preheader. 1773 Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); 1774 1775 // ExitInstruction is the single value which is used outside the loop. 1776 // We only allow for a single reduction value to be used outside the loop. 1777 // This includes users of the reduction, variables (which form a cycle 1778 // which ends in the phi node). 1779 Instruction *ExitInstruction = 0; 1780 1781 // Iter is our iterator. We start with the PHI node and scan for all of the 1782 // users of this instruction. All users must be instructions that can be 1783 // used as reduction variables (such as ADD). We may have a single 1784 // out-of-block user. The cycle must end with the original PHI. 1785 Instruction *Iter = Phi; 1786 while (true) { 1787 // If the instruction has no users then this is a broken 1788 // chain and can't be a reduction variable. 1789 if (Iter->use_empty()) 1790 return false; 1791 1792 // Any reduction instr must be of one of the allowed kinds. 1793 if (!isReductionInstr(Iter, Kind)) 1794 return false; 1795 1796 // Did we find a user inside this loop already ? 1797 bool FoundInBlockUser = false; 1798 // Did we reach the initial PHI node already ? 1799 bool FoundStartPHI = false; 1800 1801 // For each of the *users* of iter. 1802 for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); 1803 it != e; ++it) { 1804 Instruction *U = cast<Instruction>(*it); 1805 // We already know that the PHI is a user. 1806 if (U == Phi) { 1807 FoundStartPHI = true; 1808 continue; 1809 } 1810 1811 // Check if we found the exit user. 1812 BasicBlock *Parent = U->getParent(); 1813 if (!TheLoop->contains(Parent)) { 1814 // Exit if you find multiple outside users. 1815 if (ExitInstruction != 0) 1816 return false; 1817 ExitInstruction = Iter; 1818 } 1819 1820 // We allow in-loop PHINodes which are not the original reduction PHI 1821 // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE 1822 // structure) then don't skip this PHI. 1823 if (isa<PHINode>(Iter) && isa<PHINode>(U) && 1824 U->getParent() != TheLoop->getHeader() && 1825 TheLoop->contains(U) && 1826 Iter->getNumUses() > 1) 1827 continue; 1828 1829 // We can't have multiple inside users. 1830 if (FoundInBlockUser) 1831 return false; 1832 FoundInBlockUser = true; 1833 Iter = U; 1834 } 1835 1836 // We found a reduction var if we have reached the original 1837 // phi node and we only have a single instruction with out-of-loop 1838 // users. 1839 if (FoundStartPHI && ExitInstruction) { 1840 // This instruction is allowed to have out-of-loop users. 1841 AllowedExit.insert(ExitInstruction); 1842 1843 // Save the description of this reduction variable. 1844 ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); 1845 Reductions[Phi] = RD; 1846 return true; 1847 } 1848 1849 // If we've reached the start PHI but did not find an outside user then 1850 // this is dead code. Abort. 1851 if (FoundStartPHI) 1852 return false; 1853 } 1854} 1855 1856bool 1857LoopVectorizationLegality::isReductionInstr(Instruction *I, 1858 ReductionKind Kind) { 1859 switch (I->getOpcode()) { 1860 default: 1861 return false; 1862 case Instruction::PHI: 1863 // possibly. 1864 return true; 1865 case Instruction::Add: 1866 case Instruction::Sub: 1867 return Kind == IntegerAdd; 1868 case Instruction::Mul: 1869 return Kind == IntegerMult; 1870 case Instruction::And: 1871 return Kind == IntegerAnd; 1872 case Instruction::Or: 1873 return Kind == IntegerOr; 1874 case Instruction::Xor: 1875 return Kind == IntegerXor; 1876 } 1877} 1878 1879LoopVectorizationLegality::InductionKind 1880LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { 1881 Type *PhiTy = Phi->getType(); 1882 // We only handle integer and pointer inductions variables. 1883 if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) 1884 return NoInduction; 1885 1886 // Check that the PHI is consecutive and starts at zero. 1887 const SCEV *PhiScev = SE->getSCEV(Phi); 1888 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 1889 if (!AR) { 1890 DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); 1891 return NoInduction; 1892 } 1893 const SCEV *Step = AR->getStepRecurrence(*SE); 1894 1895 // Integer inductions need to have a stride of one. 1896 if (PhiTy->isIntegerTy()) { 1897 if (Step->isOne()) 1898 return IntInduction; 1899 if (Step->isAllOnesValue()) 1900 return ReverseIntInduction; 1901 return NoInduction; 1902 } 1903 1904 // Calculate the pointer stride and check if it is consecutive. 1905 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); 1906 if (!C) 1907 return NoInduction; 1908 1909 assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); 1910 uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); 1911 if (C->getValue()->equalsInt(Size)) 1912 return PtrInduction; 1913 1914 return NoInduction; 1915} 1916 1917bool LoopVectorizationLegality::isInductionVariable(const Value *V) { 1918 Value *In0 = const_cast<Value*>(V); 1919 PHINode *PN = dyn_cast_or_null<PHINode>(In0); 1920 if (!PN) 1921 return false; 1922 1923 return Inductions.count(PN); 1924} 1925 1926bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { 1927 assert(TheLoop->contains(BB) && "Unknown block used"); 1928 1929 // Blocks that do not dominate the latch need predication. 1930 BasicBlock* Latch = TheLoop->getLoopLatch(); 1931 return !DT->dominates(BB, Latch); 1932} 1933 1934bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { 1935 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 1936 // We don't predicate loads/stores at the moment. 1937 if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) 1938 return false; 1939 1940 // The instructions below can trap. 1941 switch (it->getOpcode()) { 1942 default: continue; 1943 case Instruction::UDiv: 1944 case Instruction::SDiv: 1945 case Instruction::URem: 1946 case Instruction::SRem: 1947 return false; 1948 } 1949 } 1950 1951 return true; 1952} 1953 1954bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { 1955 const SCEV *PhiScev = SE->getSCEV(Ptr); 1956 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 1957 if (!AR) 1958 return false; 1959 1960 return AR->isAffine(); 1961} 1962 1963unsigned 1964LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, 1965 unsigned UserVF) { 1966 if (OptForSize && Legal->getRuntimePointerCheck()->Need) { 1967 DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); 1968 return 1; 1969 } 1970 1971 // Find the trip count. 1972 unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); 1973 DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); 1974 1975 unsigned VF = MaxVectorSize; 1976 1977 // If we optimize the program for size, avoid creating the tail loop. 1978 if (OptForSize) { 1979 // If we are unable to calculate the trip count then don't try to vectorize. 1980 if (TC < 2) { 1981 DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); 1982 return 1; 1983 } 1984 1985 // Find the maximum SIMD width that can fit within the trip count. 1986 VF = TC % MaxVectorSize; 1987 1988 if (VF == 0) 1989 VF = MaxVectorSize; 1990 1991 // If the trip count that we found modulo the vectorization factor is not 1992 // zero then we require a tail. 1993 if (VF < 2) { 1994 DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); 1995 return 1; 1996 } 1997 } 1998 1999 if (UserVF != 0) { 2000 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 2001 DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n"); 2002 2003 return UserVF; 2004 } 2005 2006 if (!VTTI) { 2007 DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); 2008 return 1; 2009 } 2010 2011 float Cost = expectedCost(1); 2012 unsigned Width = 1; 2013 DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); 2014 for (unsigned i=2; i <= VF; i*=2) { 2015 // Notice that the vector loop needs to be executed less times, so 2016 // we need to divide the cost of the vector loops by the width of 2017 // the vector elements. 2018 float VectorCost = expectedCost(i) / (float)i; 2019 DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << 2020 (int)VectorCost << ".\n"); 2021 if (VectorCost < Cost) { 2022 Cost = VectorCost; 2023 Width = i; 2024 } 2025 } 2026 2027 DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); 2028 return Width; 2029} 2030 2031unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 2032 unsigned Cost = 0; 2033 2034 // For each block. 2035 for (Loop::block_iterator bb = TheLoop->block_begin(), 2036 be = TheLoop->block_end(); bb != be; ++bb) { 2037 unsigned BlockCost = 0; 2038 BasicBlock *BB = *bb; 2039 2040 // For each instruction in the old loop. 2041 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 2042 unsigned C = getInstructionCost(it, VF); 2043 Cost += C; 2044 DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << 2045 VF << " For instruction: "<< *it << "\n"); 2046 } 2047 2048 // We assume that if-converted blocks have a 50% chance of being executed. 2049 // When the code is scalar then some of the blocks are avoided due to CF. 2050 // When the code is vectorized we execute all code paths. 2051 if (Legal->blockNeedsPredication(*bb) && VF == 1) 2052 BlockCost /= 2; 2053 2054 Cost += BlockCost; 2055 } 2056 2057 return Cost; 2058} 2059 2060unsigned 2061LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 2062 assert(VTTI && "Invalid vector target transformation info"); 2063 2064 // If we know that this instruction will remain uniform, check the cost of 2065 // the scalar version. 2066 if (Legal->isUniformAfterVectorization(I)) 2067 VF = 1; 2068 2069 Type *RetTy = I->getType(); 2070 Type *VectorTy = ToVectorTy(RetTy, VF); 2071 2072 // TODO: We need to estimate the cost of intrinsic calls. 2073 switch (I->getOpcode()) { 2074 case Instruction::GetElementPtr: 2075 // We mark this instruction as zero-cost because scalar GEPs are usually 2076 // lowered to the intruction addressing mode. At the moment we don't 2077 // generate vector geps. 2078 return 0; 2079 case Instruction::Br: { 2080 return VTTI->getCFInstrCost(I->getOpcode()); 2081 } 2082 case Instruction::PHI: 2083 //TODO: IF-converted IFs become selects. 2084 return 0; 2085 case Instruction::Add: 2086 case Instruction::FAdd: 2087 case Instruction::Sub: 2088 case Instruction::FSub: 2089 case Instruction::Mul: 2090 case Instruction::FMul: 2091 case Instruction::UDiv: 2092 case Instruction::SDiv: 2093 case Instruction::FDiv: 2094 case Instruction::URem: 2095 case Instruction::SRem: 2096 case Instruction::FRem: 2097 case Instruction::Shl: 2098 case Instruction::LShr: 2099 case Instruction::AShr: 2100 case Instruction::And: 2101 case Instruction::Or: 2102 case Instruction::Xor: 2103 return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); 2104 case Instruction::Select: { 2105 SelectInst *SI = cast<SelectInst>(I); 2106 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 2107 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 2108 Type *CondTy = SI->getCondition()->getType(); 2109 if (ScalarCond) 2110 CondTy = VectorType::get(CondTy, VF); 2111 2112 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 2113 } 2114 case Instruction::ICmp: 2115 case Instruction::FCmp: { 2116 Type *ValTy = I->getOperand(0)->getType(); 2117 VectorTy = ToVectorTy(ValTy, VF); 2118 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); 2119 } 2120 case Instruction::Store: { 2121 StoreInst *SI = cast<StoreInst>(I); 2122 Type *ValTy = SI->getValueOperand()->getType(); 2123 VectorTy = ToVectorTy(ValTy, VF); 2124 2125 if (VF == 1) 2126 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, 2127 SI->getAlignment(), 2128 SI->getPointerAddressSpace()); 2129 2130 // Scalarized stores. 2131 int Stride = Legal->isConsecutivePtr(SI->getPointerOperand()); 2132 bool Reverse = Stride < 0; 2133 if (0 == Stride) { 2134 unsigned Cost = 0; 2135 2136 // The cost of extracting from the value vector and pointer vector. 2137 Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); 2138 for (unsigned i = 0; i < VF; ++i) { 2139 Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, 2140 VectorTy, i); 2141 Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, 2142 PtrTy, i); 2143 } 2144 2145 // The cost of the scalar stores. 2146 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 2147 ValTy->getScalarType(), 2148 SI->getAlignment(), 2149 SI->getPointerAddressSpace()); 2150 return Cost; 2151 } 2152 2153 // Wide stores. 2154 unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, 2155 SI->getAlignment(), 2156 SI->getPointerAddressSpace()); 2157 if (Reverse) 2158 Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse, 2159 VectorTy, 0); 2160 return Cost; 2161 } 2162 case Instruction::Load: { 2163 LoadInst *LI = cast<LoadInst>(I); 2164 2165 if (VF == 1) 2166 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, 2167 LI->getAlignment(), 2168 LI->getPointerAddressSpace()); 2169 2170 // Scalarized loads. 2171 int Stride = Legal->isConsecutivePtr(LI->getPointerOperand()); 2172 bool Reverse = Stride < 0; 2173 if (0 == Stride) { 2174 unsigned Cost = 0; 2175 Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); 2176 2177 // The cost of extracting from the pointer vector. 2178 for (unsigned i = 0; i < VF; ++i) 2179 Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, 2180 PtrTy, i); 2181 2182 // The cost of inserting data to the result vector. 2183 for (unsigned i = 0; i < VF; ++i) 2184 Cost += VTTI->getVectorInstrCost(Instruction::InsertElement, 2185 VectorTy, i); 2186 2187 // The cost of the scalar stores. 2188 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 2189 RetTy->getScalarType(), 2190 LI->getAlignment(), 2191 LI->getPointerAddressSpace()); 2192 return Cost; 2193 } 2194 2195 // Wide loads. 2196 unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, 2197 LI->getAlignment(), 2198 LI->getPointerAddressSpace()); 2199 if (Reverse) 2200 Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse, 2201 VectorTy, 0); 2202 return Cost; 2203 } 2204 case Instruction::ZExt: 2205 case Instruction::SExt: 2206 case Instruction::FPToUI: 2207 case Instruction::FPToSI: 2208 case Instruction::FPExt: 2209 case Instruction::PtrToInt: 2210 case Instruction::IntToPtr: 2211 case Instruction::SIToFP: 2212 case Instruction::UIToFP: 2213 case Instruction::Trunc: 2214 case Instruction::FPTrunc: 2215 case Instruction::BitCast: { 2216 // We optimize the truncation of induction variable. 2217 // The cost of these is the same as the scalar operation. 2218 if (I->getOpcode() == Instruction::Trunc && 2219 Legal->isInductionVariable(I->getOperand(0))) 2220 return VTTI->getCastInstrCost(I->getOpcode(), I->getType(), 2221 I->getOperand(0)->getType()); 2222 2223 Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); 2224 return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 2225 } 2226 case Instruction::Call: { 2227 assert(isTriviallyVectorizableIntrinsic(I)); 2228 IntrinsicInst *II = cast<IntrinsicInst>(I); 2229 Type *RetTy = ToVectorTy(II->getType(), VF); 2230 SmallVector<Type*, 4> Tys; 2231 for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) 2232 Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); 2233 return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); 2234 } 2235 default: { 2236 // We are scalarizing the instruction. Return the cost of the scalar 2237 // instruction, plus the cost of insert and extract into vector 2238 // elements, times the vector width. 2239 unsigned Cost = 0; 2240 2241 if (!RetTy->isVoidTy() && VF != 1) { 2242 unsigned InsCost = VTTI->getVectorInstrCost(Instruction::InsertElement, 2243 VectorTy); 2244 unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement, 2245 VectorTy); 2246 2247 // The cost of inserting the results plus extracting each one of the 2248 // operands. 2249 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 2250 } 2251 2252 // The cost of executing VF copies of the scalar instruction. This opcode 2253 // is unknown. Assume that it is the same as 'mul'. 2254 Cost += VF * VTTI->getArithmeticInstrCost(Instruction::Mul, VectorTy); 2255 return Cost; 2256 } 2257 }// end of switch. 2258} 2259 2260Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 2261 if (Scalar->isVoidTy() || VF == 1) 2262 return Scalar; 2263 return VectorType::get(Scalar, VF); 2264} 2265 2266char LoopVectorize::ID = 0; 2267static const char lv_name[] = "Loop Vectorization"; 2268INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 2269INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 2270INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 2271INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 2272INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 2273 2274namespace llvm { 2275 Pass *createLoopVectorizePass() { 2276 return new LoopVectorize(); 2277 } 2278} 2279 2280 2281