LoopVectorize.cpp revision d3071bb042303ed239e5c770fd4026bfc339fcd0
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9#include "LoopVectorize.h" 10#include "llvm/ADT/StringExtras.h" 11#include "llvm/Analysis/AliasAnalysis.h" 12#include "llvm/Analysis/AliasSetTracker.h" 13#include "llvm/Analysis/Dominators.h" 14#include "llvm/Analysis/LoopInfo.h" 15#include "llvm/Analysis/LoopIterator.h" 16#include "llvm/Analysis/LoopPass.h" 17#include "llvm/Analysis/ScalarEvolutionExpander.h" 18#include "llvm/Analysis/ScalarEvolutionExpander.h" 19#include "llvm/Analysis/ScalarEvolutionExpressions.h" 20#include "llvm/Analysis/ValueTracking.h" 21#include "llvm/Analysis/Verifier.h" 22#include "llvm/Constants.h" 23#include "llvm/DataLayout.h" 24#include "llvm/DerivedTypes.h" 25#include "llvm/Function.h" 26#include "llvm/Instructions.h" 27#include "llvm/IntrinsicInst.h" 28#include "llvm/LLVMContext.h" 29#include "llvm/Module.h" 30#include "llvm/Pass.h" 31#include "llvm/Support/CommandLine.h" 32#include "llvm/Support/Debug.h" 33#include "llvm/Support/raw_ostream.h" 34#include "llvm/TargetTransformInfo.h" 35#include "llvm/Transforms/Scalar.h" 36#include "llvm/Transforms/Utils/BasicBlockUtils.h" 37#include "llvm/Transforms/Utils/Local.h" 38#include "llvm/Transforms/Vectorize.h" 39#include "llvm/Type.h" 40#include "llvm/Value.h" 41 42static cl::opt<unsigned> 43VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, 44 cl::desc("Sets the SIMD width. Zero is autoselect.")); 45 46static cl::opt<bool> 47EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, 48 cl::desc("Enable if-conversion during vectorization.")); 49 50namespace { 51 52/// The LoopVectorize Pass. 53struct LoopVectorize : public LoopPass { 54 /// Pass identification, replacement for typeid 55 static char ID; 56 57 explicit LoopVectorize() : LoopPass(ID) { 58 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 59 } 60 61 ScalarEvolution *SE; 62 DataLayout *DL; 63 LoopInfo *LI; 64 TargetTransformInfo *TTI; 65 DominatorTree *DT; 66 67 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { 68 // We only vectorize innermost loops. 69 if (!L->empty()) 70 return false; 71 72 SE = &getAnalysis<ScalarEvolution>(); 73 DL = getAnalysisIfAvailable<DataLayout>(); 74 LI = &getAnalysis<LoopInfo>(); 75 TTI = getAnalysisIfAvailable<TargetTransformInfo>(); 76 DT = &getAnalysis<DominatorTree>(); 77 78 DEBUG(dbgs() << "LV: Checking a loop in \"" << 79 L->getHeader()->getParent()->getName() << "\"\n"); 80 81 // Check if it is legal to vectorize the loop. 82 LoopVectorizationLegality LVL(L, SE, DL, DT); 83 if (!LVL.canVectorize()) { 84 DEBUG(dbgs() << "LV: Not vectorizing.\n"); 85 return false; 86 } 87 88 // Select the preffered vectorization factor. 89 const VectorTargetTransformInfo *VTTI = 0; 90 if (TTI) 91 VTTI = TTI->getVectorTargetTransformInfo(); 92 // Use the cost model. 93 LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); 94 95 // Check the function attribues to find out if this function should be 96 // optimized for size. 97 Function *F = L->getHeader()->getParent(); 98 Attribute::AttrVal SzAttr= Attribute::OptimizeForSize; 99 bool OptForSize = F->getFnAttributes().hasAttribute(SzAttr); 100 101 unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); 102 103 if (VF == 1) { 104 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 105 return false; 106 } 107 108 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< 109 F->getParent()->getModuleIdentifier()<<"\n"); 110 111 // If we decided that it is *legal* to vectorizer the loop then do it. 112 InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF); 113 LB.vectorize(&LVL); 114 115 DEBUG(verifyFunction(*L->getHeader()->getParent())); 116 return true; 117 } 118 119 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 120 LoopPass::getAnalysisUsage(AU); 121 AU.addRequiredID(LoopSimplifyID); 122 AU.addRequiredID(LCSSAID); 123 AU.addRequired<LoopInfo>(); 124 AU.addRequired<ScalarEvolution>(); 125 AU.addRequired<DominatorTree>(); 126 AU.addPreserved<LoopInfo>(); 127 AU.addPreserved<DominatorTree>(); 128 } 129 130}; 131 132}// namespace 133 134//===----------------------------------------------------------------------===// 135// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 136// LoopVectorizationCostModel. 137//===----------------------------------------------------------------------===// 138 139void 140LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, 141 Loop *Lp, Value *Ptr) { 142 const SCEV *Sc = SE->getSCEV(Ptr); 143 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); 144 assert(AR && "Invalid addrec expression"); 145 const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); 146 const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); 147 Pointers.push_back(Ptr); 148 Starts.push_back(AR->getStart()); 149 Ends.push_back(ScEnd); 150} 151 152Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 153 // Create the types. 154 LLVMContext &C = V->getContext(); 155 Type *VTy = VectorType::get(V->getType(), VF); 156 Type *I32 = IntegerType::getInt32Ty(C); 157 158 // Save the current insertion location. 159 Instruction *Loc = Builder.GetInsertPoint(); 160 161 // We need to place the broadcast of invariant variables outside the loop. 162 Instruction *Instr = dyn_cast<Instruction>(V); 163 bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); 164 bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; 165 166 // Place the code for broadcasting invariant variables in the new preheader. 167 if (Invariant) 168 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 169 170 Constant *Zero = ConstantInt::get(I32, 0); 171 Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); 172 Value *UndefVal = UndefValue::get(VTy); 173 // Insert the value into a new vector. 174 Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); 175 // Broadcast the scalar into all locations in the vector. 176 Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, 177 "broadcast"); 178 179 // Restore the builder insertion point. 180 if (Invariant) 181 Builder.SetInsertPoint(Loc); 182 183 return Shuf; 184} 185 186Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) { 187 assert(Val->getType()->isVectorTy() && "Must be a vector"); 188 assert(Val->getType()->getScalarType()->isIntegerTy() && 189 "Elem must be an integer"); 190 // Create the types. 191 Type *ITy = Val->getType()->getScalarType(); 192 VectorType *Ty = cast<VectorType>(Val->getType()); 193 int VLen = Ty->getNumElements(); 194 SmallVector<Constant*, 8> Indices; 195 196 // Create a vector of consecutive numbers from zero to VF. 197 for (int i = 0; i < VLen; ++i) 198 Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i )); 199 200 // Add the consecutive indices to the vector value. 201 Constant *Cv = ConstantVector::get(Indices); 202 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 203 return Builder.CreateAdd(Val, Cv, "induction"); 204} 205 206bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { 207 assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); 208 209 // If this value is a pointer induction variable we know it is consecutive. 210 PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); 211 if (Phi && Inductions.count(Phi)) { 212 InductionInfo II = Inductions[Phi]; 213 if (PtrInduction == II.IK) 214 return true; 215 } 216 217 GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 218 if (!Gep) 219 return false; 220 221 unsigned NumOperands = Gep->getNumOperands(); 222 Value *LastIndex = Gep->getOperand(NumOperands - 1); 223 224 // Check that all of the gep indices are uniform except for the last. 225 for (unsigned i = 0; i < NumOperands - 1; ++i) 226 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 227 return false; 228 229 // We can emit wide load/stores only if the last index is the induction 230 // variable. 231 const SCEV *Last = SE->getSCEV(LastIndex); 232 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 233 const SCEV *Step = AR->getStepRecurrence(*SE); 234 235 // The memory is consecutive because the last index is consecutive 236 // and all other indices are loop invariant. 237 if (Step->isOne()) 238 return true; 239 } 240 241 return false; 242} 243 244bool LoopVectorizationLegality::isUniform(Value *V) { 245 return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); 246} 247 248Value *InnerLoopVectorizer::getVectorValue(Value *V) { 249 assert(V != Induction && "The new induction variable should not be used."); 250 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 251 // If we saved a vectorized copy of V, use it. 252 Value *&MapEntry = WidenMap[V]; 253 if (MapEntry) 254 return MapEntry; 255 256 // Broadcast V and save the value for future uses. 257 Value *B = getBroadcastInstrs(V); 258 MapEntry = B; 259 return B; 260} 261 262Constant* 263InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { 264 return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); 265} 266 267void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 268 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 269 // Holds vector parameters or scalars, in case of uniform vals. 270 SmallVector<Value*, 8> Params; 271 272 // Find all of the vectorized parameters. 273 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 274 Value *SrcOp = Instr->getOperand(op); 275 276 // If we are accessing the old induction variable, use the new one. 277 if (SrcOp == OldInduction) { 278 Params.push_back(getVectorValue(SrcOp)); 279 continue; 280 } 281 282 // Try using previously calculated values. 283 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 284 285 // If the src is an instruction that appeared earlier in the basic block 286 // then it should already be vectorized. 287 if (SrcInst && SrcInst->getParent() == Instr->getParent()) { 288 assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); 289 // The parameter is a vector value from earlier. 290 Params.push_back(WidenMap[SrcInst]); 291 } else { 292 // The parameter is a scalar from outside the loop. Maybe even a constant. 293 Params.push_back(SrcOp); 294 } 295 } 296 297 assert(Params.size() == Instr->getNumOperands() && 298 "Invalid number of operands"); 299 300 // Does this instruction return a value ? 301 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 302 Value *VecResults = 0; 303 304 // If we have a return value, create an empty vector. We place the scalarized 305 // instructions in this vector. 306 if (!IsVoidRetTy) 307 VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); 308 309 // For each scalar that we create: 310 for (unsigned i = 0; i < VF; ++i) { 311 Instruction *Cloned = Instr->clone(); 312 if (!IsVoidRetTy) 313 Cloned->setName(Instr->getName() + ".cloned"); 314 // Replace the operands of the cloned instrucions with extracted scalars. 315 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 316 Value *Op = Params[op]; 317 // Param is a vector. Need to extract the right lane. 318 if (Op->getType()->isVectorTy()) 319 Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); 320 Cloned->setOperand(op, Op); 321 } 322 323 // Place the cloned scalar in the new loop. 324 Builder.Insert(Cloned); 325 326 // If the original scalar returns a value we need to place it in a vector 327 // so that future users will be able to use it. 328 if (!IsVoidRetTy) 329 VecResults = Builder.CreateInsertElement(VecResults, Cloned, 330 Builder.getInt32(i)); 331 } 332 333 if (!IsVoidRetTy) 334 WidenMap[Instr] = VecResults; 335} 336 337Value* 338InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, 339 Instruction *Loc) { 340 LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = 341 Legal->getRuntimePointerCheck(); 342 343 if (!PtrRtCheck->Need) 344 return NULL; 345 346 Value *MemoryRuntimeCheck = 0; 347 unsigned NumPointers = PtrRtCheck->Pointers.size(); 348 SmallVector<Value* , 2> Starts; 349 SmallVector<Value* , 2> Ends; 350 351 SCEVExpander Exp(*SE, "induction"); 352 353 // Use this type for pointer arithmetic. 354 Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0); 355 356 for (unsigned i = 0; i < NumPointers; ++i) { 357 Value *Ptr = PtrRtCheck->Pointers[i]; 358 const SCEV *Sc = SE->getSCEV(Ptr); 359 360 if (SE->isLoopInvariant(Sc, OrigLoop)) { 361 DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << 362 *Ptr <<"\n"); 363 Starts.push_back(Ptr); 364 Ends.push_back(Ptr); 365 } else { 366 DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); 367 368 Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); 369 Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); 370 Starts.push_back(Start); 371 Ends.push_back(End); 372 } 373 } 374 375 for (unsigned i = 0; i < NumPointers; ++i) { 376 for (unsigned j = i+1; j < NumPointers; ++j) { 377 Instruction::CastOps Op = Instruction::BitCast; 378 Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc); 379 Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc); 380 Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc); 381 Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc); 382 383 Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 384 Start0, End1, "bound0", Loc); 385 Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 386 Start1, End0, "bound1", Loc); 387 Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, 388 "found.conflict", Loc); 389 if (MemoryRuntimeCheck) 390 MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, 391 MemoryRuntimeCheck, 392 IsConflict, 393 "conflict.rdx", Loc); 394 else 395 MemoryRuntimeCheck = IsConflict; 396 397 } 398 } 399 400 return MemoryRuntimeCheck; 401} 402 403void 404InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 405 /* 406 In this function we generate a new loop. The new loop will contain 407 the vectorized instructions while the old loop will continue to run the 408 scalar remainder. 409 410 [ ] <-- vector loop bypass. 411 / | 412 / v 413 | [ ] <-- vector pre header. 414 | | 415 | v 416 | [ ] \ 417 | [ ]_| <-- vector loop. 418 | | 419 \ v 420 >[ ] <--- middle-block. 421 / | 422 / v 423 | [ ] <--- new preheader. 424 | | 425 | v 426 | [ ] \ 427 | [ ]_| <-- old scalar loop to handle remainder. 428 \ | 429 \ v 430 >[ ] <-- exit block. 431 ... 432 */ 433 434 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 435 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 436 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 437 assert(ExitBlock && "Must have an exit block"); 438 439 // Some loops have a single integer induction variable, while other loops 440 // don't. One example is c++ iterators that often have multiple pointer 441 // induction variables. In the code below we also support a case where we 442 // don't have a single induction variable. 443 OldInduction = Legal->getInduction(); 444 Type *IdxTy = OldInduction ? OldInduction->getType() : 445 DL->getIntPtrType(SE->getContext()); 446 447 // Find the loop boundaries. 448 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); 449 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 450 451 // Get the total trip count from the count by adding 1. 452 ExitCount = SE->getAddExpr(ExitCount, 453 SE->getConstant(ExitCount->getType(), 1)); 454 455 // Expand the trip count and place the new instructions in the preheader. 456 // Notice that the pre-header does not change, only the loop body. 457 SCEVExpander Exp(*SE, "induction"); 458 459 // Count holds the overall loop count (N). 460 Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 461 BypassBlock->getTerminator()); 462 463 // The loop index does not have to start at Zero. Find the original start 464 // value from the induction PHI node. If we don't have an induction variable 465 // then we know that it starts at zero. 466 Value *StartIdx = OldInduction ? 467 OldInduction->getIncomingValueForBlock(BypassBlock): 468 ConstantInt::get(IdxTy, 0); 469 470 assert(BypassBlock && "Invalid loop structure"); 471 472 // Generate the code that checks in runtime if arrays overlap. 473 Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, 474 BypassBlock->getTerminator()); 475 476 // Split the single block loop into the two loop structure described above. 477 BasicBlock *VectorPH = 478 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 479 BasicBlock *VecBody = 480 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 481 BasicBlock *MiddleBlock = 482 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 483 BasicBlock *ScalarPH = 484 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 485 486 // This is the location in which we add all of the logic for bypassing 487 // the new vector loop. 488 Instruction *Loc = BypassBlock->getTerminator(); 489 490 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 491 // inside the loop. 492 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 493 494 // Generate the induction variable. 495 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 496 Constant *Step = ConstantInt::get(IdxTy, VF); 497 498 // We may need to extend the index in case there is a type mismatch. 499 // We know that the count starts at zero and does not overflow. 500 if (Count->getType() != IdxTy) { 501 // The exit count can be of pointer type. Convert it to the correct 502 // integer type. 503 if (ExitCount->getType()->isPointerTy()) 504 Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); 505 else 506 Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); 507 } 508 509 // Add the start index to the loop count to get the new end index. 510 Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); 511 512 // Now we need to generate the expression for N - (N % VF), which is 513 // the part that the vectorized body will execute. 514 Constant *CIVF = ConstantInt::get(IdxTy, VF); 515 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); 516 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 517 Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, 518 "end.idx.rnd.down", Loc); 519 520 // Now, compare the new count to zero. If it is zero skip the vector loop and 521 // jump to the scalar loop. 522 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 523 IdxEndRoundDown, 524 StartIdx, 525 "cmp.zero", Loc); 526 527 // If we are using memory runtime checks, include them in. 528 if (MemoryRuntimeCheck) 529 Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, 530 "CntOrMem", Loc); 531 532 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 533 // Remove the old terminator. 534 Loc->eraseFromParent(); 535 536 // We are going to resume the execution of the scalar loop. 537 // Go over all of the induction variables that we found and fix the 538 // PHIs that are left in the scalar version of the loop. 539 // The starting values of PHI nodes depend on the counter of the last 540 // iteration in the vectorized loop. 541 // If we come from a bypass edge then we need to start from the original 542 // start value. 543 544 // This variable saves the new starting index for the scalar loop. 545 PHINode *ResumeIndex = 0; 546 LoopVectorizationLegality::InductionList::iterator I, E; 547 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 548 for (I = List->begin(), E = List->end(); I != E; ++I) { 549 PHINode *OrigPhi = I->first; 550 LoopVectorizationLegality::InductionInfo II = I->second; 551 PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", 552 MiddleBlock->getTerminator()); 553 Value *EndValue = 0; 554 switch (II.IK) { 555 case LoopVectorizationLegality::NoInduction: 556 llvm_unreachable("Unknown induction"); 557 case LoopVectorizationLegality::IntInduction: { 558 // Handle the integer induction counter: 559 assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); 560 assert(OrigPhi == OldInduction && "Unknown integer PHI"); 561 // We know what the end value is. 562 EndValue = IdxEndRoundDown; 563 // We also know which PHI node holds it. 564 ResumeIndex = ResumeVal; 565 break; 566 } 567 case LoopVectorizationLegality::ReverseIntInduction: { 568 // Convert the CountRoundDown variable to the PHI size. 569 unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); 570 unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); 571 Value *CRD = CountRoundDown; 572 if (CRDSize > IISize) 573 CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, 574 II.StartValue->getType(), 575 "tr.crd", BypassBlock->getTerminator()); 576 else if (CRDSize < IISize) 577 CRD = CastInst::Create(Instruction::SExt, CountRoundDown, 578 II.StartValue->getType(), 579 "sext.crd", BypassBlock->getTerminator()); 580 // Handle reverse integer induction counter: 581 EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", 582 BypassBlock->getTerminator()); 583 break; 584 } 585 case LoopVectorizationLegality::PtrInduction: { 586 // For pointer induction variables, calculate the offset using 587 // the end index. 588 EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, 589 "ptr.ind.end", 590 BypassBlock->getTerminator()); 591 break; 592 } 593 }// end of case 594 595 // The new PHI merges the original incoming value, in case of a bypass, 596 // or the value at the end of the vectorized loop. 597 ResumeVal->addIncoming(II.StartValue, BypassBlock); 598 ResumeVal->addIncoming(EndValue, VecBody); 599 600 // Fix the scalar body counter (PHI node). 601 unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); 602 OrigPhi->setIncomingValue(BlockIdx, ResumeVal); 603 } 604 605 // If we are generating a new induction variable then we also need to 606 // generate the code that calculates the exit value. This value is not 607 // simply the end of the counter because we may skip the vectorized body 608 // in case of a runtime check. 609 if (!OldInduction){ 610 assert(!ResumeIndex && "Unexpected resume value found"); 611 ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", 612 MiddleBlock->getTerminator()); 613 ResumeIndex->addIncoming(StartIdx, BypassBlock); 614 ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); 615 } 616 617 // Make sure that we found the index where scalar loop needs to continue. 618 assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && 619 "Invalid resume Index"); 620 621 // Add a check in the middle block to see if we have completed 622 // all of the iterations in the first vector loop. 623 // If (N - N%VF) == N, then we *don't* need to run the remainder. 624 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 625 ResumeIndex, "cmp.n", 626 MiddleBlock->getTerminator()); 627 628 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 629 // Remove the old terminator. 630 MiddleBlock->getTerminator()->eraseFromParent(); 631 632 // Create i+1 and fill the PHINode. 633 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 634 Induction->addIncoming(StartIdx, VectorPH); 635 Induction->addIncoming(NextIdx, VecBody); 636 // Create the compare. 637 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 638 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 639 640 // Now we have two terminators. Remove the old one from the block. 641 VecBody->getTerminator()->eraseFromParent(); 642 643 // Get ready to start creating new instructions into the vectorized body. 644 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 645 646 // Create and register the new vector loop. 647 Loop* Lp = new Loop(); 648 Loop *ParentLoop = OrigLoop->getParentLoop(); 649 650 // Insert the new loop into the loop nest and register the new basic blocks. 651 if (ParentLoop) { 652 ParentLoop->addChildLoop(Lp); 653 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 654 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 655 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 656 } else { 657 LI->addTopLevelLoop(Lp); 658 } 659 660 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 661 662 // Save the state. 663 LoopVectorPreHeader = VectorPH; 664 LoopScalarPreHeader = ScalarPH; 665 LoopMiddleBlock = MiddleBlock; 666 LoopExitBlock = ExitBlock; 667 LoopVectorBody = VecBody; 668 LoopScalarBody = OldBasicBlock; 669 LoopBypassBlock = BypassBlock; 670} 671 672/// This function returns the identity element (or neutral element) for 673/// the operation K. 674static unsigned 675getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { 676 switch (K) { 677 case LoopVectorizationLegality::IntegerXor: 678 case LoopVectorizationLegality::IntegerAdd: 679 case LoopVectorizationLegality::IntegerOr: 680 // Adding, Xoring, Oring zero to a number does not change it. 681 return 0; 682 case LoopVectorizationLegality::IntegerMult: 683 // Multiplying a number by 1 does not change it. 684 return 1; 685 case LoopVectorizationLegality::IntegerAnd: 686 // AND-ing a number with an all-1 value does not change it. 687 return -1; 688 default: 689 llvm_unreachable("Unknown reduction kind"); 690 } 691} 692 693static bool 694isTriviallyVectorizableIntrinsic(Instruction *Inst) { 695 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); 696 if (!II) 697 return false; 698 switch (II->getIntrinsicID()) { 699 case Intrinsic::sqrt: 700 case Intrinsic::sin: 701 case Intrinsic::cos: 702 case Intrinsic::exp: 703 case Intrinsic::exp2: 704 case Intrinsic::log: 705 case Intrinsic::log10: 706 case Intrinsic::log2: 707 case Intrinsic::fabs: 708 case Intrinsic::floor: 709 case Intrinsic::ceil: 710 case Intrinsic::trunc: 711 case Intrinsic::rint: 712 case Intrinsic::nearbyint: 713 case Intrinsic::pow: 714 case Intrinsic::fma: 715 return true; 716 default: 717 return false; 718 } 719 return false; 720} 721 722void 723InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { 724 //===------------------------------------------------===// 725 // 726 // Notice: any optimization or new instruction that go 727 // into the code below should be also be implemented in 728 // the cost-model. 729 // 730 //===------------------------------------------------===// 731 BasicBlock &BB = *OrigLoop->getHeader(); 732 Constant *Zero = 733 ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0); 734 735 // In order to support reduction variables we need to be able to vectorize 736 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 737 // stages. First, we create a new vector PHI node with no incoming edges. 738 // We use this value when we vectorize all of the instructions that use the 739 // PHI. Next, after all of the instructions in the block are complete we 740 // add the new incoming edges to the PHI. At this point all of the 741 // instructions in the basic block are vectorized, so we can use them to 742 // construct the PHI. 743 PhiVector RdxPHIsToFix; 744 745 // Scan the loop in a topological order to ensure that defs are vectorized 746 // before users. 747 LoopBlocksDFS DFS(OrigLoop); 748 DFS.perform(LI); 749 750 // Vectorize all of the blocks in the original loop. 751 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 752 be = DFS.endRPO(); bb != be; ++bb) 753 vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix); 754 755 // At this point every instruction in the original loop is widened to 756 // a vector form. We are almost done. Now, we need to fix the PHI nodes 757 // that we vectorized. The PHI nodes are currently empty because we did 758 // not want to introduce cycles. Notice that the remaining PHI nodes 759 // that we need to fix are reduction variables. 760 761 // Create the 'reduced' values for each of the induction vars. 762 // The reduced values are the vector values that we scalarize and combine 763 // after the loop is finished. 764 for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); 765 it != e; ++it) { 766 PHINode *RdxPhi = *it; 767 PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]); 768 assert(RdxPhi && "Unable to recover vectorized PHI"); 769 770 // Find the reduction variable descriptor. 771 assert(Legal->getReductionVars()->count(RdxPhi) && 772 "Unable to find the reduction variable"); 773 LoopVectorizationLegality::ReductionDescriptor RdxDesc = 774 (*Legal->getReductionVars())[RdxPhi]; 775 776 // We need to generate a reduction vector from the incoming scalar. 777 // To do so, we need to generate the 'identity' vector and overide 778 // one of the elements with the incoming scalar reduction. We need 779 // to do it in the vector-loop preheader. 780 Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); 781 782 // This is the vector-clone of the value that leaves the loop. 783 Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); 784 Type *VecTy = VectorExit->getType(); 785 786 // Find the reduction identity variable. Zero for addition, or, xor, 787 // one for multiplication, -1 for And. 788 Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), 789 VecTy->getScalarType()); 790 791 // This vector is the Identity vector where the first element is the 792 // incoming scalar reduction. 793 Value *VectorStart = Builder.CreateInsertElement(Identity, 794 RdxDesc.StartValue, Zero); 795 796 // Fix the vector-loop phi. 797 // We created the induction variable so we know that the 798 // preheader is the first entry. 799 BasicBlock *VecPreheader = Induction->getIncomingBlock(0); 800 801 // Reductions do not have to start at zero. They can start with 802 // any loop invariant values. 803 VecRdxPhi->addIncoming(VectorStart, VecPreheader); 804 Value *Val = 805 getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 806 VecRdxPhi->addIncoming(Val, LoopVectorBody); 807 808 // Before each round, move the insertion point right between 809 // the PHIs and the values we are going to write. 810 // This allows us to write both PHINodes and the extractelement 811 // instructions. 812 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); 813 814 // This PHINode contains the vectorized reduction variable, or 815 // the initial value vector, if we bypass the vector loop. 816 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); 817 NewPhi->addIncoming(VectorStart, LoopBypassBlock); 818 NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); 819 820 // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles 821 // and vector ops, reducing the set of values being computed by half each 822 // round. 823 assert(isPowerOf2_32(VF) && 824 "Reduction emission only supported for pow2 vectors!"); 825 Value *TmpVec = NewPhi; 826 SmallVector<Constant*, 32> ShuffleMask(VF, 0); 827 for (unsigned i = VF; i != 1; i >>= 1) { 828 // Move the upper half of the vector to the lower half. 829 for (unsigned j = 0; j != i/2; ++j) 830 ShuffleMask[j] = Builder.getInt32(i/2 + j); 831 832 // Fill the rest of the mask with undef. 833 std::fill(&ShuffleMask[i/2], ShuffleMask.end(), 834 UndefValue::get(Builder.getInt32Ty())); 835 836 Value *Shuf = 837 Builder.CreateShuffleVector(TmpVec, 838 UndefValue::get(TmpVec->getType()), 839 ConstantVector::get(ShuffleMask), 840 "rdx.shuf"); 841 842 // Emit the operation on the shuffled value. 843 switch (RdxDesc.Kind) { 844 case LoopVectorizationLegality::IntegerAdd: 845 TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx"); 846 break; 847 case LoopVectorizationLegality::IntegerMult: 848 TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx"); 849 break; 850 case LoopVectorizationLegality::IntegerOr: 851 TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx"); 852 break; 853 case LoopVectorizationLegality::IntegerAnd: 854 TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx"); 855 break; 856 case LoopVectorizationLegality::IntegerXor: 857 TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx"); 858 break; 859 default: 860 llvm_unreachable("Unknown reduction operation"); 861 } 862 } 863 864 // The result is in the first element of the vector. 865 Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); 866 867 // Now, we need to fix the users of the reduction variable 868 // inside and outside of the scalar remainder loop. 869 // We know that the loop is in LCSSA form. We need to update the 870 // PHI nodes in the exit blocks. 871 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 872 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 873 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 874 if (!LCSSAPhi) continue; 875 876 // All PHINodes need to have a single entry edge, or two if 877 // we already fixed them. 878 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 879 880 // We found our reduction value exit-PHI. Update it with the 881 // incoming bypass edge. 882 if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { 883 // Add an edge coming from the bypass. 884 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); 885 break; 886 } 887 }// end of the LCSSA phi scan. 888 889 // Fix the scalar loop reduction variable with the incoming reduction sum 890 // from the vector body and from the backedge value. 891 int IncomingEdgeBlockIdx = 892 (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); 893 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 894 // Pick the other block. 895 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 896 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); 897 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); 898 }// end of for each redux variable. 899} 900 901Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 902 assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && 903 "Invalid edge"); 904 905 Value *SrcMask = createBlockInMask(Src); 906 907 // The terminator has to be a branch inst! 908 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 909 assert(BI && "Unexpected terminator found"); 910 911 Value *EdgeMask = SrcMask; 912 if (BI->isConditional()) { 913 EdgeMask = getVectorValue(BI->getCondition()); 914 if (BI->getSuccessor(0) != Dst) 915 EdgeMask = Builder.CreateNot(EdgeMask); 916 } 917 918 return Builder.CreateAnd(EdgeMask, SrcMask); 919} 920 921Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { 922 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 923 924 // Loop incoming mask is all-one. 925 if (OrigLoop->getHeader() == BB) { 926 Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); 927 return getVectorValue(C); 928 } 929 930 // This is the block mask. We OR all incoming edges, and with zero. 931 Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); 932 Value *BlockMask = getVectorValue(Zero); 933 934 // For each pred: 935 for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) 936 BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB)); 937 938 return BlockMask; 939} 940 941void 942InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, 943 BasicBlock *BB, PhiVector *PV) { 944 Constant *Zero = 945 ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0); 946 947 // For each instruction in the old loop. 948 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 949 switch (it->getOpcode()) { 950 case Instruction::Br: 951 // Nothing to do for PHIs and BR, since we already took care of the 952 // loop control flow instructions. 953 continue; 954 case Instruction::PHI:{ 955 PHINode* P = cast<PHINode>(it); 956 // Handle reduction variables: 957 if (Legal->getReductionVars()->count(P)) { 958 // This is phase one of vectorizing PHIs. 959 Type *VecTy = VectorType::get(it->getType(), VF); 960 WidenMap[it] = 961 PHINode::Create(VecTy, 2, "vec.phi", 962 LoopVectorBody->getFirstInsertionPt()); 963 PV->push_back(P); 964 continue; 965 } 966 967 // Check for PHI nodes that are lowered to vector selects. 968 if (P->getParent() != OrigLoop->getHeader()) { 969 // We know that all PHIs in non header blocks are converted into 970 // selects, so we don't have to worry about the insertion order and we 971 // can just use the builder. 972 973 // At this point we generate the predication tree. There may be 974 // duplications since this is a simple recursive scan, but future 975 // optimizations will clean it up. 976 Value *Cond = createEdgeMask(P->getIncomingBlock(0), P->getParent()); 977 WidenMap[P] = 978 Builder.CreateSelect(Cond, 979 getVectorValue(P->getIncomingValue(0)), 980 getVectorValue(P->getIncomingValue(1)), 981 "predphi"); 982 continue; 983 } 984 985 // This PHINode must be an induction variable. 986 // Make sure that we know about it. 987 assert(Legal->getInductionVars()->count(P) && 988 "Not an induction variable"); 989 990 LoopVectorizationLegality::InductionInfo II = 991 Legal->getInductionVars()->lookup(P); 992 993 switch (II.IK) { 994 case LoopVectorizationLegality::NoInduction: 995 llvm_unreachable("Unknown induction"); 996 case LoopVectorizationLegality::IntInduction: { 997 assert(P == OldInduction && "Unexpected PHI"); 998 Value *Broadcasted = getBroadcastInstrs(Induction); 999 // After broadcasting the induction variable we need to make the 1000 // vector consecutive by adding 0, 1, 2 ... 1001 Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); 1002 WidenMap[OldInduction] = ConsecutiveInduction; 1003 continue; 1004 } 1005 case LoopVectorizationLegality::ReverseIntInduction: 1006 case LoopVectorizationLegality::PtrInduction: 1007 // Handle reverse integer and pointer inductions. 1008 Value *StartIdx = 0; 1009 // If we have a single integer induction variable then use it. 1010 // Otherwise, start counting at zero. 1011 if (OldInduction) { 1012 LoopVectorizationLegality::InductionInfo OldII = 1013 Legal->getInductionVars()->lookup(OldInduction); 1014 StartIdx = OldII.StartValue; 1015 } else { 1016 StartIdx = ConstantInt::get(Induction->getType(), 0); 1017 } 1018 // This is the normalized GEP that starts counting at zero. 1019 Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, 1020 "normalized.idx"); 1021 1022 // Handle the reverse integer induction variable case. 1023 if (LoopVectorizationLegality::ReverseIntInduction == II.IK) { 1024 IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); 1025 Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, 1026 "resize.norm.idx"); 1027 Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, 1028 "reverse.idx"); 1029 1030 // This is a new value so do not hoist it out. 1031 Value *Broadcasted = getBroadcastInstrs(ReverseInd); 1032 // After broadcasting the induction variable we need to make the 1033 // vector consecutive by adding ... -3, -2, -1, 0. 1034 Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted, 1035 true); 1036 WidenMap[it] = ConsecutiveInduction; 1037 continue; 1038 } 1039 1040 // Handle the pointer induction variable case. 1041 assert(P->getType()->isPointerTy() && "Unexpected type."); 1042 1043 // This is the vector of results. Notice that we don't generate 1044 // vector geps because scalar geps result in better code. 1045 Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); 1046 for (unsigned int i = 0; i < VF; ++i) { 1047 Constant *Idx = ConstantInt::get(Induction->getType(), i); 1048 Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, 1049 "gep.idx"); 1050 Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, 1051 "next.gep"); 1052 VecVal = Builder.CreateInsertElement(VecVal, SclrGep, 1053 Builder.getInt32(i), 1054 "insert.gep"); 1055 } 1056 1057 WidenMap[it] = VecVal; 1058 continue; 1059 } 1060 1061 }// End of PHI. 1062 1063 case Instruction::Add: 1064 case Instruction::FAdd: 1065 case Instruction::Sub: 1066 case Instruction::FSub: 1067 case Instruction::Mul: 1068 case Instruction::FMul: 1069 case Instruction::UDiv: 1070 case Instruction::SDiv: 1071 case Instruction::FDiv: 1072 case Instruction::URem: 1073 case Instruction::SRem: 1074 case Instruction::FRem: 1075 case Instruction::Shl: 1076 case Instruction::LShr: 1077 case Instruction::AShr: 1078 case Instruction::And: 1079 case Instruction::Or: 1080 case Instruction::Xor: { 1081 // Just widen binops. 1082 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); 1083 Value *A = getVectorValue(it->getOperand(0)); 1084 Value *B = getVectorValue(it->getOperand(1)); 1085 1086 // Use this vector value for all users of the original instruction. 1087 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); 1088 WidenMap[it] = V; 1089 1090 // Update the NSW, NUW and Exact flags. 1091 BinaryOperator *VecOp = cast<BinaryOperator>(V); 1092 if (isa<OverflowingBinaryOperator>(BinOp)) { 1093 VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); 1094 VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); 1095 } 1096 if (isa<PossiblyExactOperator>(VecOp)) 1097 VecOp->setIsExact(BinOp->isExact()); 1098 break; 1099 } 1100 case Instruction::Select: { 1101 // Widen selects. 1102 // If the selector is loop invariant we can create a select 1103 // instruction with a scalar condition. Otherwise, use vector-select. 1104 Value *Cond = it->getOperand(0); 1105 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); 1106 1107 // The condition can be loop invariant but still defined inside the 1108 // loop. This means that we can't just use the original 'cond' value. 1109 // We have to take the 'vectorized' value and pick the first lane. 1110 // Instcombine will make this a no-op. 1111 Cond = getVectorValue(Cond); 1112 if (InvariantCond) 1113 Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); 1114 1115 Value *Op0 = getVectorValue(it->getOperand(1)); 1116 Value *Op1 = getVectorValue(it->getOperand(2)); 1117 WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1); 1118 break; 1119 } 1120 1121 case Instruction::ICmp: 1122 case Instruction::FCmp: { 1123 // Widen compares. Generate vector compares. 1124 bool FCmp = (it->getOpcode() == Instruction::FCmp); 1125 CmpInst *Cmp = dyn_cast<CmpInst>(it); 1126 Value *A = getVectorValue(it->getOperand(0)); 1127 Value *B = getVectorValue(it->getOperand(1)); 1128 if (FCmp) 1129 WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 1130 else 1131 WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B); 1132 break; 1133 } 1134 1135 case Instruction::Store: { 1136 // Attempt to issue a wide store. 1137 StoreInst *SI = dyn_cast<StoreInst>(it); 1138 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); 1139 Value *Ptr = SI->getPointerOperand(); 1140 unsigned Alignment = SI->getAlignment(); 1141 1142 assert(!Legal->isUniform(Ptr) && 1143 "We do not allow storing to uniform addresses"); 1144 1145 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1146 1147 // This store does not use GEPs. 1148 if (!Legal->isConsecutivePtr(Ptr)) { 1149 scalarizeInstruction(it); 1150 break; 1151 } 1152 1153 if (Gep) { 1154 // The last index does not have to be the induction. It can be 1155 // consecutive and be a function of the index. For example A[I+1]; 1156 unsigned NumOperands = Gep->getNumOperands(); 1157 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); 1158 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1159 1160 // Create the new GEP with the new induction variable. 1161 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1162 Gep2->setOperand(NumOperands - 1, LastIndex); 1163 Ptr = Builder.Insert(Gep2); 1164 } else { 1165 // Use the induction element ptr. 1166 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1167 Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); 1168 } 1169 Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); 1170 Value *Val = getVectorValue(SI->getValueOperand()); 1171 Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); 1172 break; 1173 } 1174 case Instruction::Load: { 1175 // Attempt to issue a wide load. 1176 LoadInst *LI = dyn_cast<LoadInst>(it); 1177 Type *RetTy = VectorType::get(LI->getType(), VF); 1178 Value *Ptr = LI->getPointerOperand(); 1179 unsigned Alignment = LI->getAlignment(); 1180 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1181 1182 // If the pointer is loop invariant or if it is non consecutive, 1183 // scalarize the load. 1184 bool Con = Legal->isConsecutivePtr(Ptr); 1185 if (Legal->isUniform(Ptr) || !Con) { 1186 scalarizeInstruction(it); 1187 break; 1188 } 1189 1190 if (Gep) { 1191 // The last index does not have to be the induction. It can be 1192 // consecutive and be a function of the index. For example A[I+1]; 1193 unsigned NumOperands = Gep->getNumOperands(); 1194 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); 1195 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1196 1197 // Create the new GEP with the new induction variable. 1198 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1199 Gep2->setOperand(NumOperands - 1, LastIndex); 1200 Ptr = Builder.Insert(Gep2); 1201 } else { 1202 // Use the induction element ptr. 1203 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1204 Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); 1205 } 1206 1207 Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); 1208 LI = Builder.CreateLoad(Ptr); 1209 LI->setAlignment(Alignment); 1210 // Use this vector value for all users of the load. 1211 WidenMap[it] = LI; 1212 break; 1213 } 1214 case Instruction::ZExt: 1215 case Instruction::SExt: 1216 case Instruction::FPToUI: 1217 case Instruction::FPToSI: 1218 case Instruction::FPExt: 1219 case Instruction::PtrToInt: 1220 case Instruction::IntToPtr: 1221 case Instruction::SIToFP: 1222 case Instruction::UIToFP: 1223 case Instruction::Trunc: 1224 case Instruction::FPTrunc: 1225 case Instruction::BitCast: { 1226 CastInst *CI = dyn_cast<CastInst>(it); 1227 /// Optimize the special case where the source is the induction 1228 /// variable. Notice that we can only optimize the 'trunc' case 1229 /// because: a. FP conversions lose precision, b. sext/zext may wrap, 1230 /// c. other casts depend on pointer size. 1231 if (CI->getOperand(0) == OldInduction && 1232 it->getOpcode() == Instruction::Trunc) { 1233 Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, 1234 CI->getType()); 1235 Value *Broadcasted = getBroadcastInstrs(ScalarCast); 1236 WidenMap[it] = getConsecutiveVector(Broadcasted); 1237 break; 1238 } 1239 /// Vectorize casts. 1240 Value *A = getVectorValue(it->getOperand(0)); 1241 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 1242 WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy); 1243 break; 1244 } 1245 1246 case Instruction::Call: { 1247 assert(isTriviallyVectorizableIntrinsic(it)); 1248 Module *M = BB->getParent()->getParent(); 1249 IntrinsicInst *II = cast<IntrinsicInst>(it); 1250 Intrinsic::ID ID = II->getIntrinsicID(); 1251 SmallVector<Value*, 4> Args; 1252 for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) 1253 Args.push_back(getVectorValue(II->getArgOperand(i))); 1254 Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; 1255 Function *F = Intrinsic::getDeclaration(M, ID, Tys); 1256 WidenMap[it] = Builder.CreateCall(F, Args); 1257 break; 1258 } 1259 1260 default: 1261 // All other instructions are unsupported. Scalarize them. 1262 scalarizeInstruction(it); 1263 break; 1264 }// end of switch. 1265 }// end of for_each instr. 1266} 1267 1268void InnerLoopVectorizer::updateAnalysis() { 1269 // Forget the original basic block. 1270 SE->forgetLoop(OrigLoop); 1271 1272 // Update the dominator tree information. 1273 assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && 1274 "Entry does not dominate exit."); 1275 1276 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); 1277 DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); 1278 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); 1279 DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); 1280 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 1281 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 1282 1283 DEBUG(DT->verifyAnalysis()); 1284} 1285 1286bool LoopVectorizationLegality::canVectorizeWithIfConvert() { 1287 if (!EnableIfConversion) 1288 return false; 1289 1290 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); 1291 std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); 1292 1293 // Collect the blocks that need predication. 1294 for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { 1295 BasicBlock *BB = LoopBlocks[i]; 1296 1297 // We don't support switch statements inside loops. 1298 if (!isa<BranchInst>(BB->getTerminator())) 1299 return false; 1300 1301 // We must have at most two predecessors because we need to convert 1302 // all PHIs to selects. 1303 unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); 1304 if (Preds > 2) 1305 return false; 1306 1307 // We must be able to predicate all blocks that need to be predicated. 1308 if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) 1309 return false; 1310 } 1311 1312 // We can if-convert this loop. 1313 return true; 1314} 1315 1316bool LoopVectorizationLegality::canVectorize() { 1317 assert(TheLoop->getLoopPreheader() && "No preheader!!"); 1318 1319 // We can only vectorize innermost loops. 1320 if (TheLoop->getSubLoopsVector().size()) 1321 return false; 1322 1323 // We must have a single backedge. 1324 if (TheLoop->getNumBackEdges() != 1) 1325 return false; 1326 1327 // We must have a single exiting block. 1328 if (!TheLoop->getExitingBlock()) 1329 return false; 1330 1331 unsigned NumBlocks = TheLoop->getNumBlocks(); 1332 1333 // Check if we can if-convert non single-bb loops. 1334 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { 1335 DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); 1336 return false; 1337 } 1338 1339 // We need to have a loop header. 1340 BasicBlock *Latch = TheLoop->getLoopLatch(); 1341 DEBUG(dbgs() << "LV: Found a loop: " << 1342 TheLoop->getHeader()->getName() << "\n"); 1343 1344 // ScalarEvolution needs to be able to find the exit count. 1345 const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); 1346 if (ExitCount == SE->getCouldNotCompute()) { 1347 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 1348 return false; 1349 } 1350 1351 // Do not loop-vectorize loops with a tiny trip count. 1352 unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); 1353 if (TC > 0u && TC < TinyTripCountThreshold) { 1354 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << 1355 "This loop is not worth vectorizing.\n"); 1356 return false; 1357 } 1358 1359 // Check if we can vectorize the instructions and CFG in this loop. 1360 if (!canVectorizeInstrs()) { 1361 DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); 1362 return false; 1363 } 1364 1365 // Go over each instruction and look at memory deps. 1366 if (!canVectorizeMemory()) { 1367 DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); 1368 return false; 1369 } 1370 1371 // Collect all of the variables that remain uniform after vectorization. 1372 collectLoopUniforms(); 1373 1374 DEBUG(dbgs() << "LV: We can vectorize this loop" << 1375 (PtrRtCheck.Need ? " (with a runtime bound check)" : "") 1376 <<"!\n"); 1377 1378 // Okay! We can vectorize. At this point we don't have any other mem analysis 1379 // which may limit our maximum vectorization factor, so just return true with 1380 // no restrictions. 1381 return true; 1382} 1383 1384bool LoopVectorizationLegality::canVectorizeInstrs() { 1385 BasicBlock *PreHeader = TheLoop->getLoopPreheader(); 1386 BasicBlock *Header = TheLoop->getHeader(); 1387 1388 // For each block in the loop. 1389 for (Loop::block_iterator bb = TheLoop->block_begin(), 1390 be = TheLoop->block_end(); bb != be; ++bb) { 1391 1392 // Scan the instructions in the block and look for hazards. 1393 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 1394 ++it) { 1395 1396 if (PHINode *Phi = dyn_cast<PHINode>(it)) { 1397 // This should not happen because the loop should be normalized. 1398 if (Phi->getNumIncomingValues() != 2) { 1399 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 1400 return false; 1401 } 1402 1403 // Check that this PHI type is allowed. 1404 if (!Phi->getType()->isIntegerTy() && 1405 !Phi->getType()->isPointerTy()) { 1406 DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); 1407 return false; 1408 } 1409 1410 // If this PHINode is not in the header block, then we know that we 1411 // can convert it to select during if-conversion. No need to check if 1412 // the PHIs in this block are induction or reduction variables. 1413 if (*bb != Header) 1414 continue; 1415 1416 // This is the value coming from the preheader. 1417 Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); 1418 // Check if this is an induction variable. 1419 InductionKind IK = isInductionVariable(Phi); 1420 1421 if (NoInduction != IK) { 1422 // Int inductions are special because we only allow one IV. 1423 if (IK == IntInduction) { 1424 if (Induction) { 1425 DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); 1426 return false; 1427 } 1428 Induction = Phi; 1429 } 1430 1431 DEBUG(dbgs() << "LV: Found an induction variable.\n"); 1432 Inductions[Phi] = InductionInfo(StartValue, IK); 1433 continue; 1434 } 1435 1436 if (AddReductionVar(Phi, IntegerAdd)) { 1437 DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); 1438 continue; 1439 } 1440 if (AddReductionVar(Phi, IntegerMult)) { 1441 DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); 1442 continue; 1443 } 1444 if (AddReductionVar(Phi, IntegerOr)) { 1445 DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); 1446 continue; 1447 } 1448 if (AddReductionVar(Phi, IntegerAnd)) { 1449 DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); 1450 continue; 1451 } 1452 if (AddReductionVar(Phi, IntegerXor)) { 1453 DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); 1454 continue; 1455 } 1456 1457 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 1458 return false; 1459 }// end of PHI handling 1460 1461 // We still don't handle functions. 1462 CallInst *CI = dyn_cast<CallInst>(it); 1463 if (CI && !isTriviallyVectorizableIntrinsic(it)) { 1464 DEBUG(dbgs() << "LV: Found a call site.\n"); 1465 return false; 1466 } 1467 1468 // We do not re-vectorize vectors. 1469 if (!VectorType::isValidElementType(it->getType()) && 1470 !it->getType()->isVoidTy()) { 1471 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); 1472 return false; 1473 } 1474 1475 // Reduction instructions are allowed to have exit users. 1476 // All other instructions must not have external users. 1477 if (!AllowedExit.count(it)) 1478 //Check that all of the users of the loop are inside the BB. 1479 for (Value::use_iterator I = it->use_begin(), E = it->use_end(); 1480 I != E; ++I) { 1481 Instruction *U = cast<Instruction>(*I); 1482 // This user may be a reduction exit value. 1483 if (!TheLoop->contains(U)) { 1484 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); 1485 return false; 1486 } 1487 } 1488 } // next instr. 1489 1490 } 1491 1492 if (!Induction) { 1493 DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); 1494 assert(getInductionVars()->size() && "No induction variables"); 1495 } 1496 1497 return true; 1498} 1499 1500void LoopVectorizationLegality::collectLoopUniforms() { 1501 // We now know that the loop is vectorizable! 1502 // Collect variables that will remain uniform after vectorization. 1503 std::vector<Value*> Worklist; 1504 BasicBlock *Latch = TheLoop->getLoopLatch(); 1505 1506 // Start with the conditional branch and walk up the block. 1507 Worklist.push_back(Latch->getTerminator()->getOperand(0)); 1508 1509 while (Worklist.size()) { 1510 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 1511 Worklist.pop_back(); 1512 1513 // Look at instructions inside this loop. 1514 // Stop when reaching PHI nodes. 1515 // TODO: we need to follow values all over the loop, not only in this block. 1516 if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) 1517 continue; 1518 1519 // This is a known uniform. 1520 Uniforms.insert(I); 1521 1522 // Insert all operands. 1523 for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { 1524 Worklist.push_back(I->getOperand(i)); 1525 } 1526 } 1527} 1528 1529bool LoopVectorizationLegality::canVectorizeMemory() { 1530 typedef SmallVector<Value*, 16> ValueVector; 1531 typedef SmallPtrSet<Value*, 16> ValueSet; 1532 // Holds the Load and Store *instructions*. 1533 ValueVector Loads; 1534 ValueVector Stores; 1535 PtrRtCheck.Pointers.clear(); 1536 PtrRtCheck.Need = false; 1537 1538 // For each block. 1539 for (Loop::block_iterator bb = TheLoop->block_begin(), 1540 be = TheLoop->block_end(); bb != be; ++bb) { 1541 1542 // Scan the BB and collect legal loads and stores. 1543 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 1544 ++it) { 1545 1546 // If this is a load, save it. If this instruction can read from memory 1547 // but is not a load, then we quit. Notice that we don't handle function 1548 // calls that read or write. 1549 if (it->mayReadFromMemory()) { 1550 LoadInst *Ld = dyn_cast<LoadInst>(it); 1551 if (!Ld) return false; 1552 if (!Ld->isSimple()) { 1553 DEBUG(dbgs() << "LV: Found a non-simple load.\n"); 1554 return false; 1555 } 1556 Loads.push_back(Ld); 1557 continue; 1558 } 1559 1560 // Save 'store' instructions. Abort if other instructions write to memory. 1561 if (it->mayWriteToMemory()) { 1562 StoreInst *St = dyn_cast<StoreInst>(it); 1563 if (!St) return false; 1564 if (!St->isSimple()) { 1565 DEBUG(dbgs() << "LV: Found a non-simple store.\n"); 1566 return false; 1567 } 1568 Stores.push_back(St); 1569 } 1570 } // next instr. 1571 } // next block. 1572 1573 // Now we have two lists that hold the loads and the stores. 1574 // Next, we find the pointers that they use. 1575 1576 // Check if we see any stores. If there are no stores, then we don't 1577 // care if the pointers are *restrict*. 1578 if (!Stores.size()) { 1579 DEBUG(dbgs() << "LV: Found a read-only loop!\n"); 1580 return true; 1581 } 1582 1583 // Holds the read and read-write *pointers* that we find. 1584 ValueVector Reads; 1585 ValueVector ReadWrites; 1586 1587 // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects 1588 // multiple times on the same object. If the ptr is accessed twice, once 1589 // for read and once for write, it will only appear once (on the write 1590 // list). This is okay, since we are going to check for conflicts between 1591 // writes and between reads and writes, but not between reads and reads. 1592 ValueSet Seen; 1593 1594 ValueVector::iterator I, IE; 1595 for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { 1596 StoreInst *ST = dyn_cast<StoreInst>(*I); 1597 assert(ST && "Bad StoreInst"); 1598 Value* Ptr = ST->getPointerOperand(); 1599 1600 if (isUniform(Ptr)) { 1601 DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); 1602 return false; 1603 } 1604 1605 // If we did *not* see this pointer before, insert it to 1606 // the read-write list. At this phase it is only a 'write' list. 1607 if (Seen.insert(Ptr)) 1608 ReadWrites.push_back(Ptr); 1609 } 1610 1611 for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { 1612 LoadInst *LD = dyn_cast<LoadInst>(*I); 1613 assert(LD && "Bad LoadInst"); 1614 Value* Ptr = LD->getPointerOperand(); 1615 // If we did *not* see this pointer before, insert it to the 1616 // read list. If we *did* see it before, then it is already in 1617 // the read-write list. This allows us to vectorize expressions 1618 // such as A[i] += x; Because the address of A[i] is a read-write 1619 // pointer. This only works if the index of A[i] is consecutive. 1620 // If the address of i is unknown (for example A[B[i]]) then we may 1621 // read a few words, modify, and write a few words, and some of the 1622 // words may be written to the same address. 1623 if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr)) 1624 Reads.push_back(Ptr); 1625 } 1626 1627 // If we write (or read-write) to a single destination and there are no 1628 // other reads in this loop then is it safe to vectorize. 1629 if (ReadWrites.size() == 1 && Reads.size() == 0) { 1630 DEBUG(dbgs() << "LV: Found a write-only loop!\n"); 1631 return true; 1632 } 1633 1634 // Find pointers with computable bounds. We are going to use this information 1635 // to place a runtime bound check. 1636 bool RT = true; 1637 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) 1638 if (hasComputableBounds(*I)) { 1639 PtrRtCheck.insert(SE, TheLoop, *I); 1640 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 1641 } else { 1642 RT = false; 1643 break; 1644 } 1645 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) 1646 if (hasComputableBounds(*I)) { 1647 PtrRtCheck.insert(SE, TheLoop, *I); 1648 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 1649 } else { 1650 RT = false; 1651 break; 1652 } 1653 1654 // Check that we did not collect too many pointers or found a 1655 // unsizeable pointer. 1656 if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { 1657 PtrRtCheck.reset(); 1658 RT = false; 1659 } 1660 1661 PtrRtCheck.Need = RT; 1662 1663 if (RT) { 1664 DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); 1665 } 1666 1667 // Now that the pointers are in two lists (Reads and ReadWrites), we 1668 // can check that there are no conflicts between each of the writes and 1669 // between the writes to the reads. 1670 ValueSet WriteObjects; 1671 ValueVector TempObjects; 1672 1673 // Check that the read-writes do not conflict with other read-write 1674 // pointers. 1675 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { 1676 GetUnderlyingObjects(*I, TempObjects, DL); 1677 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1678 it != e; ++it) { 1679 if (!isIdentifiedObject(*it)) { 1680 DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); 1681 return RT; 1682 } 1683 if (!WriteObjects.insert(*it)) { 1684 DEBUG(dbgs() << "LV: Found a possible write-write reorder:" 1685 << **it <<"\n"); 1686 return RT; 1687 } 1688 } 1689 TempObjects.clear(); 1690 } 1691 1692 /// Check that the reads don't conflict with the read-writes. 1693 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { 1694 GetUnderlyingObjects(*I, TempObjects, DL); 1695 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1696 it != e; ++it) { 1697 if (!isIdentifiedObject(*it)) { 1698 DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); 1699 return RT; 1700 } 1701 if (WriteObjects.count(*it)) { 1702 DEBUG(dbgs() << "LV: Found a possible read/write reorder:" 1703 << **it <<"\n"); 1704 return RT; 1705 } 1706 } 1707 TempObjects.clear(); 1708 } 1709 1710 // It is safe to vectorize and we don't need any runtime checks. 1711 DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n"); 1712 PtrRtCheck.reset(); 1713 return true; 1714} 1715 1716bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, 1717 ReductionKind Kind) { 1718 if (Phi->getNumIncomingValues() != 2) 1719 return false; 1720 1721 // Reduction variables are only found in the loop header block. 1722 if (Phi->getParent() != TheLoop->getHeader()) 1723 return false; 1724 1725 // Obtain the reduction start value from the value that comes from the loop 1726 // preheader. 1727 Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); 1728 1729 // ExitInstruction is the single value which is used outside the loop. 1730 // We only allow for a single reduction value to be used outside the loop. 1731 // This includes users of the reduction, variables (which form a cycle 1732 // which ends in the phi node). 1733 Instruction *ExitInstruction = 0; 1734 1735 // Iter is our iterator. We start with the PHI node and scan for all of the 1736 // users of this instruction. All users must be instructions which can be 1737 // used as reduction variables (such as ADD). We may have a single 1738 // out-of-block user. They cycle must end with the original PHI. 1739 // Also, we can't have multiple block-local users. 1740 Instruction *Iter = Phi; 1741 while (true) { 1742 // If the instruction has no users then this is a broken 1743 // chain and can't be a reduction variable. 1744 if (Iter->use_empty()) 1745 return false; 1746 1747 // Any reduction instr must be of one of the allowed kinds. 1748 if (!isReductionInstr(Iter, Kind)) 1749 return false; 1750 1751 // Did we find a user inside this block ? 1752 bool FoundInBlockUser = false; 1753 // Did we reach the initial PHI node ? 1754 bool FoundStartPHI = false; 1755 1756 // For each of the *users* of iter. 1757 for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); 1758 it != e; ++it) { 1759 Instruction *U = cast<Instruction>(*it); 1760 // We already know that the PHI is a user. 1761 if (U == Phi) { 1762 FoundStartPHI = true; 1763 continue; 1764 } 1765 1766 // Check if we found the exit user. 1767 BasicBlock *Parent = U->getParent(); 1768 if (!TheLoop->contains(Parent)) { 1769 // Exit if you find multiple outside users. 1770 if (ExitInstruction != 0) 1771 return false; 1772 ExitInstruction = Iter; 1773 } 1774 1775 // We allow in-loop PHINodes which are not the original reduction PHI 1776 // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE 1777 // structure) then don't skip this PHI. 1778 if (isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() && 1779 TheLoop->contains(U) && Iter->getNumUses() > 1) 1780 continue; 1781 1782 // We can't have multiple inside users. 1783 if (FoundInBlockUser) 1784 return false; 1785 FoundInBlockUser = true; 1786 Iter = U; 1787 } 1788 1789 // We found a reduction var if we have reached the original 1790 // phi node and we only have a single instruction with out-of-loop 1791 // users. 1792 if (FoundStartPHI && ExitInstruction) { 1793 // This instruction is allowed to have out-of-loop users. 1794 AllowedExit.insert(ExitInstruction); 1795 1796 // Save the description of this reduction variable. 1797 ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); 1798 Reductions[Phi] = RD; 1799 return true; 1800 } 1801 1802 // If we've reached the start PHI but did not find an outside user then 1803 // this is dead code. Abort. 1804 if (FoundStartPHI) 1805 return false; 1806 } 1807} 1808 1809bool 1810LoopVectorizationLegality::isReductionInstr(Instruction *I, 1811 ReductionKind Kind) { 1812 switch (I->getOpcode()) { 1813 default: 1814 return false; 1815 case Instruction::PHI: 1816 // possibly. 1817 return true; 1818 case Instruction::Add: 1819 case Instruction::Sub: 1820 return Kind == IntegerAdd; 1821 case Instruction::Mul: 1822 return Kind == IntegerMult; 1823 case Instruction::And: 1824 return Kind == IntegerAnd; 1825 case Instruction::Or: 1826 return Kind == IntegerOr; 1827 case Instruction::Xor: 1828 return Kind == IntegerXor; 1829 } 1830} 1831 1832LoopVectorizationLegality::InductionKind 1833LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { 1834 Type *PhiTy = Phi->getType(); 1835 // We only handle integer and pointer inductions variables. 1836 if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) 1837 return NoInduction; 1838 1839 // Check that the PHI is consecutive and starts at zero. 1840 const SCEV *PhiScev = SE->getSCEV(Phi); 1841 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 1842 if (!AR) { 1843 DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); 1844 return NoInduction; 1845 } 1846 const SCEV *Step = AR->getStepRecurrence(*SE); 1847 1848 // Integer inductions need to have a stride of one. 1849 if (PhiTy->isIntegerTy()) { 1850 if (Step->isOne()) 1851 return IntInduction; 1852 if (Step->isAllOnesValue()) 1853 return ReverseIntInduction; 1854 return NoInduction; 1855 } 1856 1857 // Calculate the pointer stride and check if it is consecutive. 1858 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); 1859 if (!C) 1860 return NoInduction; 1861 1862 assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); 1863 uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); 1864 if (C->getValue()->equalsInt(Size)) 1865 return PtrInduction; 1866 1867 return NoInduction; 1868} 1869 1870bool LoopVectorizationLegality::isInductionVariable(const Value *V) { 1871 Value *In0 = const_cast<Value*>(V); 1872 PHINode *PN = dyn_cast_or_null<PHINode>(In0); 1873 if (!PN) 1874 return false; 1875 1876 return Inductions.count(PN); 1877} 1878 1879bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { 1880 assert(TheLoop->contains(BB) && "Unknown block used"); 1881 1882 // Blocks that do not dominate the latch need predication. 1883 BasicBlock* Latch = TheLoop->getLoopLatch(); 1884 return !DT->dominates(BB, Latch); 1885} 1886 1887bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { 1888 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 1889 // We don't predicate loads/stores at the moment. 1890 if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) 1891 return false; 1892 1893 // The instructions below can trap. 1894 switch (it->getOpcode()) { 1895 default: continue; 1896 case Instruction::UDiv: 1897 case Instruction::SDiv: 1898 case Instruction::URem: 1899 case Instruction::SRem: 1900 return false; 1901 } 1902 } 1903 1904 return true; 1905} 1906 1907bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { 1908 const SCEV *PhiScev = SE->getSCEV(Ptr); 1909 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 1910 if (!AR) 1911 return false; 1912 1913 return AR->isAffine(); 1914} 1915 1916unsigned 1917LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, 1918 unsigned UserVF) { 1919 if (OptForSize && Legal->getRuntimePointerCheck()->Need) { 1920 DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); 1921 return 1; 1922 } 1923 1924 // Find the trip count. 1925 unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); 1926 DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); 1927 1928 unsigned VF = MaxVectorSize; 1929 1930 // If we optimize the program for size, avoid creating the tail loop. 1931 if (OptForSize) { 1932 // If we are unable to calculate the trip count then don't try to vectorize. 1933 if (TC < 2) { 1934 DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); 1935 return 1; 1936 } 1937 1938 // Find the maximum SIMD width that can fit within the trip count. 1939 VF = TC % MaxVectorSize; 1940 1941 if (VF == 0) 1942 VF = MaxVectorSize; 1943 1944 // If the trip count that we found modulo the vectorization factor is not 1945 // zero then we require a tail. 1946 if (VF < 2) { 1947 DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); 1948 return 1; 1949 } 1950 } 1951 1952 if (UserVF != 0) { 1953 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 1954 DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n"); 1955 1956 return UserVF; 1957 } 1958 1959 if (!VTTI) { 1960 DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); 1961 return 1; 1962 } 1963 1964 float Cost = expectedCost(1); 1965 unsigned Width = 1; 1966 DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); 1967 for (unsigned i=2; i <= VF; i*=2) { 1968 // Notice that the vector loop needs to be executed less times, so 1969 // we need to divide the cost of the vector loops by the width of 1970 // the vector elements. 1971 float VectorCost = expectedCost(i) / (float)i; 1972 DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << 1973 (int)VectorCost << ".\n"); 1974 if (VectorCost < Cost) { 1975 Cost = VectorCost; 1976 Width = i; 1977 } 1978 } 1979 1980 DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); 1981 return Width; 1982} 1983 1984unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 1985 unsigned Cost = 0; 1986 1987 // For each block. 1988 for (Loop::block_iterator bb = TheLoop->block_begin(), 1989 be = TheLoop->block_end(); bb != be; ++bb) { 1990 unsigned BlockCost = 0; 1991 BasicBlock *BB = *bb; 1992 1993 // For each instruction in the old loop. 1994 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 1995 unsigned C = getInstructionCost(it, VF); 1996 Cost += C; 1997 DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << 1998 VF << " For instruction: "<< *it << "\n"); 1999 } 2000 2001 // We assume that if-converted blocks have a 50% chance of being executed. 2002 // When the code is scalar then some of the blocks are avoided due to CF. 2003 // When the code is vectorized we execute all code paths. 2004 if (Legal->blockNeedsPredication(*bb) && VF == 1) 2005 BlockCost /= 2; 2006 2007 Cost += BlockCost; 2008 } 2009 2010 return Cost; 2011} 2012 2013unsigned 2014LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 2015 assert(VTTI && "Invalid vector target transformation info"); 2016 2017 // If we know that this instruction will remain uniform, check the cost of 2018 // the scalar version. 2019 if (Legal->isUniformAfterVectorization(I)) 2020 VF = 1; 2021 2022 Type *RetTy = I->getType(); 2023 Type *VectorTy = ToVectorTy(RetTy, VF); 2024 2025 // TODO: We need to estimate the cost of intrinsic calls. 2026 switch (I->getOpcode()) { 2027 case Instruction::GetElementPtr: 2028 // We mark this instruction as zero-cost because scalar GEPs are usually 2029 // lowered to the intruction addressing mode. At the moment we don't 2030 // generate vector geps. 2031 return 0; 2032 case Instruction::Br: { 2033 return VTTI->getCFInstrCost(I->getOpcode()); 2034 } 2035 case Instruction::PHI: 2036 //TODO: IF-converted IFs become selects. 2037 return 0; 2038 case Instruction::Add: 2039 case Instruction::FAdd: 2040 case Instruction::Sub: 2041 case Instruction::FSub: 2042 case Instruction::Mul: 2043 case Instruction::FMul: 2044 case Instruction::UDiv: 2045 case Instruction::SDiv: 2046 case Instruction::FDiv: 2047 case Instruction::URem: 2048 case Instruction::SRem: 2049 case Instruction::FRem: 2050 case Instruction::Shl: 2051 case Instruction::LShr: 2052 case Instruction::AShr: 2053 case Instruction::And: 2054 case Instruction::Or: 2055 case Instruction::Xor: 2056 return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); 2057 case Instruction::Select: { 2058 SelectInst *SI = cast<SelectInst>(I); 2059 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 2060 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 2061 Type *CondTy = SI->getCondition()->getType(); 2062 if (ScalarCond) 2063 CondTy = VectorType::get(CondTy, VF); 2064 2065 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 2066 } 2067 case Instruction::ICmp: 2068 case Instruction::FCmp: { 2069 Type *ValTy = I->getOperand(0)->getType(); 2070 VectorTy = ToVectorTy(ValTy, VF); 2071 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); 2072 } 2073 case Instruction::Store: { 2074 StoreInst *SI = cast<StoreInst>(I); 2075 Type *ValTy = SI->getValueOperand()->getType(); 2076 VectorTy = ToVectorTy(ValTy, VF); 2077 2078 if (VF == 1) 2079 return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, 2080 SI->getAlignment(), 2081 SI->getPointerAddressSpace()); 2082 2083 // Scalarized stores. 2084 if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { 2085 unsigned Cost = 0; 2086 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 2087 ValTy); 2088 // The cost of extracting from the value vector. 2089 Cost += VF * (ExtCost); 2090 // The cost of the scalar stores. 2091 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 2092 ValTy->getScalarType(), 2093 SI->getAlignment(), 2094 SI->getPointerAddressSpace()); 2095 return Cost; 2096 } 2097 2098 // Wide stores. 2099 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), 2100 SI->getPointerAddressSpace()); 2101 } 2102 case Instruction::Load: { 2103 LoadInst *LI = cast<LoadInst>(I); 2104 2105 if (VF == 1) 2106 return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, 2107 LI->getAlignment(), 2108 LI->getPointerAddressSpace()); 2109 2110 // Scalarized loads. 2111 if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { 2112 unsigned Cost = 0; 2113 unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); 2114 // The cost of inserting the loaded value into the result vector. 2115 Cost += VF * (InCost); 2116 // The cost of the scalar stores. 2117 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 2118 RetTy->getScalarType(), 2119 LI->getAlignment(), 2120 LI->getPointerAddressSpace()); 2121 return Cost; 2122 } 2123 2124 // Wide loads. 2125 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), 2126 LI->getPointerAddressSpace()); 2127 } 2128 case Instruction::ZExt: 2129 case Instruction::SExt: 2130 case Instruction::FPToUI: 2131 case Instruction::FPToSI: 2132 case Instruction::FPExt: 2133 case Instruction::PtrToInt: 2134 case Instruction::IntToPtr: 2135 case Instruction::SIToFP: 2136 case Instruction::UIToFP: 2137 case Instruction::Trunc: 2138 case Instruction::FPTrunc: 2139 case Instruction::BitCast: { 2140 // We optimize the truncation of induction variable. 2141 // The cost of these is the same as the scalar operation. 2142 if (I->getOpcode() == Instruction::Trunc && 2143 Legal->isInductionVariable(I->getOperand(0))) 2144 return VTTI->getCastInstrCost(I->getOpcode(), I->getType(), 2145 I->getOperand(0)->getType()); 2146 2147 Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); 2148 return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 2149 } 2150 case Instruction::Call: { 2151 assert(isTriviallyVectorizableIntrinsic(I)); 2152 IntrinsicInst *II = cast<IntrinsicInst>(I); 2153 Type *RetTy = ToVectorTy(II->getType(), VF); 2154 SmallVector<Type*, 4> Tys; 2155 for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) 2156 Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); 2157 return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); 2158 } 2159 default: { 2160 // We are scalarizing the instruction. Return the cost of the scalar 2161 // instruction, plus the cost of insert and extract into vector 2162 // elements, times the vector width. 2163 unsigned Cost = 0; 2164 2165 bool IsVoid = RetTy->isVoidTy(); 2166 2167 unsigned InsCost = (IsVoid ? 0 : 2168 VTTI->getInstrCost(Instruction::InsertElement, 2169 VectorTy)); 2170 2171 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 2172 VectorTy); 2173 2174 // The cost of inserting the results plus extracting each one of the 2175 // operands. 2176 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 2177 2178 // The cost of executing VF copies of the scalar instruction. 2179 Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); 2180 return Cost; 2181 } 2182 }// end of switch. 2183} 2184 2185Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 2186 if (Scalar->isVoidTy() || VF == 1) 2187 return Scalar; 2188 return VectorType::get(Scalar, VF); 2189} 2190 2191char LoopVectorize::ID = 0; 2192static const char lv_name[] = "Loop Vectorization"; 2193INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 2194INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 2195INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 2196INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 2197INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 2198 2199namespace llvm { 2200 Pass *createLoopVectorizePass() { 2201 return new LoopVectorize(); 2202 } 2203} 2204 2205 2206